A Coding Guide on LLM Post-Training with TRL: From Supervised Fine-Tuning to DPO and GRPO Reasoning


import subprocess, sys

# Install the post-training stack: TRL, Transformers, PEFT, plus quantization deps.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-U",
    "torchao>=0.16",
    "trl>=0.20",
    "transformers>=4.45",
    "datasets",
    "peft>=0.13",
    "accelerate",
    "bitsandbytes",
])

import sys as _sys

# Drop any stale torchao/peft modules cached by the runtime so the upgraded
# versions are picked up; if torchao still fails to import, register a stub
# module so downstream imports do not crash.
for _m in [m for m in list(_sys.modules) if m.startswith(("torchao", "peft"))]:
    _sys.modules.pop(_m, None)
try:
    import torchao
except Exception:
    import types
    _fake = types.ModuleType("torchao")
    _fake.__version__ = "0.16.1"
    _sys.modules["torchao"] = _fake

import os, re, gc, torch, warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig

print(f"torch={torch.__version__} cuda={torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} "
          f"({torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB)")

MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BF16_OK = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

LORA_CFG = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
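
Before moving on, it can help to sanity-check what this adapter configuration actually trains. The short snippet below is an illustrative check, not part of the original listing: it wraps a throwaway copy of the base model with LORA_CFG via peft.get_peft_model, prints the trainable-parameter count, and discards it.

# Optional sanity check (illustrative): attach the LoRA adapters to a throwaway
# copy of the base model and report how many parameters they make trainable.
from peft import get_peft_model

_preview = get_peft_model(AutoModelForCausalLM.from_pretrained(MODEL_NAME), LORA_CFG)
_preview.print_trainable_parameters()
del _preview
gc.collect()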

def cleanup():
    """Release VRAM between training stages (Colab T4 is tight)."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def chat_generate(model, tokenizer, prompt, max_new_tokens=120):
    """Helper: format as chat, generate, decode just the assistant turn."""
    msgs = [{"role": "user", "content": prompt}]
    ids = tokenizer.apply_chat_template(
        msgs, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    with torch.no_grad():
        out = model.generate(
            ids, max_new_tokens=max_new_tokens,
            do_sample=True, temperature=0.7, top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(out[0][ids.shape[-1]:], skip_special_tokens=True)
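
With the helpers in place, a quick smoke test shows how the pieces fit together. The snippet below is a minimal sketch rather than the article's full training recipe: it loads the base Qwen2.5-0.5B-Instruct model, prints a baseline completion with chat_generate, and hands LORA_CFG to TRL's SFTTrainer. The dataset (trl-lib/Capybara, a small conversational set) and all hyperparameters are illustrative assumptions sized for a Colab T4.

# Minimal sketch (assumptions: trl-lib/Capybara as a stand-in dataset,
# illustrative hyperparameters sized for a Colab T4).
from trl import SFTConfig, SFTTrainer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if BF16_OK else torch.float32,
).to(DEVICE)

# Baseline answer before any post-training.
print(chat_generate(model, tokenizer, "Explain LoRA fine-tuning in one sentence."))

train_ds = load_dataset("trl-lib/Capybara", split="train[:500]")  # assumed small slice

sft_args = SFTConfig(
    output_dir="qwen-sft-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=BF16_OK,
    logging_steps=10,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=train_ds,
    processing_class=tokenizer,
    peft_config=LORA_CFG,   # attach the LoRA adapters defined above
)
trainer.train()

del trainer, model
cleanup()   # free VRAM before the next stage (DPO / GRPO)

The same pattern repeats for the later stages: train, delete the trainer and model, call cleanup(), then load fresh weights for the next objective.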


