Fine-Tuning LLMs in 2026: A Practical Guide to Custom AI Models



Prompt engineering gets you 80% of the way. Fine-tuning gets you the last 20%—but it’s expensive and easy to mess up. Here’s when it makes sense and how to do it right.

When to Fine-Tune (and When Not To)

Fine-tune when:

  • You need consistent output format (JSON, specific structure)
  • Domain vocabulary matters (legal, medical, your product)
  • Latency is critical (shorter prompts = faster inference)
  • You have high-quality training data (1000+ examples)

Don’t fine-tune when:

  • Prompt engineering works well enough
  • Your data is messy or limited
  • You need the model to learn new facts (use RAG instead)
  • You’re still iterating on the task definition

The Fine-Tuning Landscape in 2026

Provider      Models                        Best For
OpenAI        GPT-4o, GPT-4o-mini           Production, API simplicity
Anthropic     Claude (via Amazon Bedrock)   Enterprise, safety-critical
Google        Gemini                        Multimodal tasks
Open Source   Llama 3, Mistral, Qwen        Control, cost, privacy

OpenAI Fine-Tuning

Preparing Training Data

# training_data.py
import json

def prepare_openai_format(examples: list[dict]) -> list[dict]:
    """Convert examples to OpenAI's chat format."""
    formatted = []
    
    for ex in examples:
        formatted.append({
            "messages": [
                {"role": "system", "content": ex["system_prompt"]},
                {"role": "user", "content": ex["input"]},
                {"role": "assistant", "content": ex["output"]}
            ]
        })
    
    return formatted

# Example: Customer support classifier
examples = [
    {
        "system_prompt": "Classify customer messages into categories.",
        "input": "My order hasn't arrived and it's been 2 weeks",
        "output": '{"category": "shipping", "sentiment": "frustrated", "priority": "high"}'
    },
    {
        "system_prompt": "Classify customer messages into categories.",
        "input": "How do I change my password?",
        "output": '{"category": "account", "sentiment": "neutral", "priority": "low"}'
    },
    # ... 1000+ more examples
]

# Save as JSONL
with open("training.jsonl", "w") as f:
    for item in prepare_openai_format(examples):
        f.write(json.dumps(item) + "\n")
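
If you want OpenAI to report validation loss during training, hold out a slice of the examples and upload it as a separate validation file. A minimal sketch (the 90/10 split is arbitrary; this replaces the single save step above):

import random

random.seed(42)
random.shuffle(examples)
split = int(len(examples) * 0.9)

# Write separate train / validation JSONL files
for path, subset in [("training.jsonl", examples[:split]),
                     ("validation.jsonl", examples[split:])]:
    with open(path, "w") as f:
        for item in prepare_openai_format(subset):
            f.write(json.dumps(item) + "\n")

Upload validation.jsonl the same way as the training file and pass its file ID as validation_file when creating the job.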

Starting the Fine-Tune

from openai import OpenAI

client = OpenAI()

# Upload training file
file = client.files.create(
    file=open("training.jsonl", "rb"),
    purpose="fine-tune"
)

# Start fine-tuning
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={
        "n_epochs": 3,
        "batch_size": 8,
        "learning_rate_multiplier": 1.0
    }
)

print(f"Job started: {job.id}")

Monitoring Progress

# Check status
job = client.fine_tuning.jobs.retrieve(job.id)
print(f"Status: {job.status}")

# List events
events = client.fine_tuning.jobs.list_events(job.id, limit=10)
for event in events.data:
    print(f"{event.created_at}: {event.message}")

LoRA: Efficient Fine-Tuning for Open Source

Full fine-tuning updates all model weights. LoRA (Low-Rank Adaptation) only trains small adapter layers—faster and cheaper.
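
The intuition, as a toy sketch (illustrative only, not how PEFT implements it internally): freeze the original weight matrix W and learn a low-rank update B·A, so a 4096x4096 layer trains about 131K parameters instead of about 16.8M.

import torch

d_out, d_in, r, alpha = 4096, 4096, 16, 32

W = torch.randn(d_out, d_in)                        # frozen base weight
A = (0.01 * torch.randn(r, d_in)).requires_grad_()  # trainable, small random init
B = torch.zeros(d_out, r, requires_grad=True)       # trainable, zero init: no change at step 0

def lora_linear(x):
    # Base path stays frozen; gradients only flow through A and B
    return x @ W.T + (alpha / r) * (x @ A.T @ B.T)

print(d_out * d_in, r * (d_in + d_out))  # 16,777,216 vs 131,072 trainable params

At deployment time the product B·A can be folded back into W, which is what merging the adapter does later in this post.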

Setting Up LoRA with PEFT

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Load base model
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # Rank - higher = more capacity
    lora_alpha=32,           # Scaling factor
    lora_dropout=0.1,
    target_modules=[         # Which layers to adapt
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Prints something like: trainable params: ~42M || all params: ~8.07B || trainable%: ~0.52

QLoRA: When GPU Memory is Limited

from transformers import BitsAndBytesConfig

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Now LoRA trains on the quantized model
# 8B model fits on 24GB GPU instead of 80GB+
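
Rough back-of-envelope numbers behind that comment (weights and optimizer state only, ignoring activations; exact figures depend on sequence length, batch size, and gradient checkpointing):

params = 8e9

# Full fine-tuning with Adam in mixed precision: ~16 bytes per parameter
# (bf16 weights + grads, fp32 master weights + two Adam moment buffers)
full_ft_gb = params * 16 / 1e9    # ~128 GB
# LoRA: frozen bf16 weights at 2 bytes/param, plus a small adapter and its optimizer state
lora_gb = params * 2 / 1e9        # ~16 GB
# QLoRA: 4-bit NF4 weights at ~0.5 bytes/param, plus the same small adapter
qlora_gb = params * 0.5 / 1e9     # ~4 GB

print(full_ft_gb, lora_gb, qlora_gb)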

Training Loop

from transformers import TrainingArguments, Trainer
from datasets import load_dataset

# Load your dataset of raw {system_prompt, input, output} records
# (one JSON object per line; not the OpenAI messages-format file from earlier)
dataset = load_dataset("json", data_files="training.jsonl")

# Llama tokenizers ship without a pad token; reuse EOS so padding works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_prompt(example):
    return f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{example['system_prompt']}<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{example['input']}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
{example['output']}<|eot_id|>"""

# Tokenize (for simplicity, labels mirror input_ids; in practice you'd also
# mask pad tokens and the prompt portion with -100)
def tokenize(example):
    text = format_prompt(example)
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=2048,
        padding="max_length"
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized = dataset.map(tokenize, remove_columns=dataset["train"].column_names)

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    optim="paged_adamw_32bit",
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
)

trainer.train()

# Save adapter weights only (small!)
model.save_pretrained("./llama-lora-adapter")
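
Before going further, it's worth a quick smoke test: load the base model, attach the saved adapter, and generate on an input the model hasn't seen. A sketch (the prompt below is illustrative and mirrors the training format):

from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
tuned = PeftModel.from_pretrained(base, "./llama-lora-adapter")

prompt = ("<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
          "Classify customer messages into categories.<|eot_id|>\n"
          "<|start_header_id|>user<|end_header_id|>\n"
          "Where is my refund?<|eot_id|>\n"
          "<|start_header_id|>assistant<|end_header_id|>\n")
inputs = tokenizer(prompt, return_tensors="pt").to(base.device)
outputs = tuned.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))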

Evaluation Strategy

Fine-tuning without evaluation is flying blind.

import json

import numpy as np

def is_valid_json(text: str) -> bool:
    """Task-specific helper: does the output parse as JSON?"""
    try:
        json.loads(text)
        return True
    except json.JSONDecodeError:
        return False

def extract_category(text: str):
    """Task-specific helper: pull the category field out of a JSON output."""
    try:
        return json.loads(text).get("category")
    except json.JSONDecodeError:
        return None

def evaluate_model(model, tokenizer, test_data):
    results = []

    for example in test_data:
        # Generate a completion for the input
        inputs = tokenizer(example["input"], return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=256)
        # Decode only the newly generated tokens, not the prompt
        generated = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )

        # Compare to the expected output
        expected = example["output"]

        # Custom metrics for your task
        results.append({
            "exact_match": generated.strip() == expected.strip(),
            "valid_json": is_valid_json(generated),
            "category_correct": extract_category(generated) == extract_category(expected),
        })

    # Aggregate
    return {
        "exact_match": np.mean([r["exact_match"] for r in results]),
        "valid_json": np.mean([r["valid_json"] for r in results]),
        "category_accuracy": np.mean([r["category_correct"] for r in results]),
    }
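
Run it on a held-out test set the model never trained on, and compare against the un-tuned base model so you know the fine-tune actually moved the needle. The file name and model handles here are placeholders:

with open("test.jsonl") as f:
    test_data = [json.loads(line) for line in f]

print("base model:", evaluate_model(base_model, tokenizer, test_data))
print("fine-tuned:", evaluate_model(tuned_model, tokenizer, test_data))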

Common Pitfalls

1. Overfitting

# Signs: training loss keeps dropping while eval loss rises
# Fix: early stopping, more data, fewer epochs
training_args = TrainingArguments(
    # ...
    eval_strategy="steps",        # "evaluation_strategy" in older transformers versions
    eval_steps=100,
    save_strategy="steps",        # must match eval_strategy for load_best_model_at_end
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)
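
With those arguments in place, transformers' EarlyStoppingCallback will halt training once eval loss stops improving. A sketch, assuming you hold out part of the tokenized dataset for evaluation:

from transformers import EarlyStoppingCallback

# Hold out 10% of the tokenized data for evaluation
split = tokenized["train"].train_test_split(test_size=0.1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # stop after 3 evals with no improvement
)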

2. Catastrophic Forgetting

# Model forgets general knowledge
# Fix: Include diverse examples, not just task-specific
training_data = (
    task_specific_examples +  # Your use case
    general_chat_examples     # Keep general ability
)

3. Poor Data Quality

# Garbage in, garbage out
# Fix: review and clean your data before training
import json

def validate_example(example):
    # Check required fields exist and are non-empty
    if not example.get("input") or not example.get("output"):
        return False

    # Check the output is valid JSON (for a JSON-output task)
    try:
        json.loads(example["output"])
    except (json.JSONDecodeError, TypeError):
        return False

    # Drop trivially short inputs
    if len(example["input"]) < 10:
        return False

    return True

clean_data = [ex for ex in raw_data if validate_example(ex)]

Deploying Fine-Tuned Models

OpenAI

# Just use the fine-tuned model ID
response = client.chat.completions.create(
    model="ft:gpt-4o-mini-2024-07-18:my-org::abc123",
    messages=[{"role": "user", "content": "Classify this..."}]
)

Open Source with vLLM

# merge_lora.py — fold the LoRA adapter into the base weights for serving
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, "./llama-lora-adapter").merge_and_unload()
merged.save_pretrained("./llama-merged")
AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").save_pretrained("./llama-merged")

# Serve with vLLM
vllm serve ./llama-merged \
    --port 8000 \
    --tensor-parallel-size 2
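
vLLM exposes an OpenAI-compatible API, so the client code barely changes. A sketch (port and model path as configured above; the api_key value is a dummy unless you start the server with --api-key):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="./llama-merged",   # served model name defaults to the path passed to vllm serve
    messages=[{"role": "user", "content": "Classify this..."}],
)
print(response.choices[0].message.content)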

Cost Comparison (as of 2026)

Approach             Training Cost       Inference Cost    Time to Deploy
OpenAI Fine-Tune     ~$50-500            2x base model     Hours
LoRA on 8B           ~$20 (cloud GPU)    Self-hosted       Hours
Full Fine-Tune 70B   ~$500-5000          Self-hosted       Days

Fine-tuning is powerful but not magic. Start with prompting, measure what’s missing, then fine-tune with quality data. Your evaluation suite matters more than your hyperparameters.
