# Codette-Training / train_codette_lora.py
# Raiff1982 β€” "Update train_codette_lora.py" (commit ec28c07, verified)
#!/usr/bin/env python3
# /// script
# dependencies = [
# "transformers>=4.40.0",
# "peft>=0.10.0",
# "datasets>=2.18.0",
# "torch>=2.2.0",
# "accelerate>=0.28.0",
# "huggingface_hub>=0.22.0",
# ]
# ///
"""
Codette LoRA Fine-Tuning β€” HuggingFace Jobs
Base model : meta-llama/Llama-3.2-1B-Instruct
Adapter : LoRA r=16, targets q_proj / v_proj
Output : Raiff1982/codette-llama-adapter (HF Hub)
Run via HF Jobs:
hf jobs run train_codette_lora.py \
--flavor=cpu-basic \
--env HF_TOKEN=$HF_TOKEN
"""
import os, json, math
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download, login
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
# ── Config ─────────────────────────────────────────────────────────────────
# Hub auth token; empty string when the env var is unset (push is skipped then).
HF_TOKEN: str = os.environ.get("HF_TOKEN", "")
# Gated repo β€” access requires an accepted license and a valid HF_TOKEN.
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO = "Raiff1982/codette-llama-adapter" # where adapter is pushed
DATA_REPO = "Raiff1982/codette-training"
# NOTE: rebound later to the local cache path returned by hf_hub_download.
DATA_FILE = "codette_v2_train.jsonl"
MAX_LEN: int = 512    # truncation length in tokens
EPOCHS: int = 3
BATCH: int = 1        # per-device micro-batch
GRAD_ACCUM: int = 8 # effective batch = 8
LR: float = 2e-4
OUTPUT_DIR = "./codette_adapter_output"
# Codette system prompt β€” baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices β€” Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
# ── Auth ───────────────────────────────────────────────────────────────────
# Log in up front so every later Hub call can reuse the credential.
if not HF_TOKEN:
    print("[!] No HF_TOKEN β€” Hub push will fail")
else:
    login(token=HF_TOKEN)
    print("[βœ“] Logged in to HuggingFace Hub")
# ── Download training data ──────────────────────────────────────────────────
print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
# DATA_FILE is deliberately rebound from the repo filename to the local cache
# path returned by hf_hub_download; later open()/log lines rely on this.
DATA_FILE = hf_hub_download(
    repo_id=DATA_REPO,
    filename=DATA_FILE,
    repo_type="model",
    # "" is not a valid token β€” pass None when unset so huggingface_hub can
    # fall back to locally cached credentials instead of sending a bad header.
    token=HF_TOKEN or None,
)
print(f"[βœ“] Training data at: {DATA_FILE}")
# ── Load tokenizer ─────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
# Gated repo: pass None (not "") when no token is configured so cached
# credentials are used instead of an invalid empty auth header.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN or None)
if tokenizer.pad_token is None:
    # Llama tokenizers ship without a pad token; reuse EOS so the collator
    # can pad without resizing the embedding matrix.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad after the sequence for causal LM
# ── Load base model (CPU safe β€” no device_map) ─────────────────────────────
print("[*] Loading base model …")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,  # CPU training β€” fp32 only, no fp16/bf16
    low_cpu_mem_usage=True,
    token=HF_TOKEN or None,  # "" is not a valid token β€” use None when unset
)
# ── Add LoRA ───────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
# Low-rank adapters on attention query/value projections only.
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
# ── Load & format training data ────────────────────────────────────────────
print(f"[*] Loading training data from {DATA_FILE} …")
examples = []
skipped = 0
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            # One corrupt JSONL line should not abort a multi-hour job.
            print(f"[!] Skipping malformed JSON on line {line_no}: {e}")
            skipped += 1
            continue
        instruction = obj.get("instruction", "")
        # Accept either "output" or legacy "response" as the target field.
        output = obj.get("output", obj.get("response", ""))
        if not instruction or not output:
            skipped += 1
            continue
        examples.append({"instruction": instruction, "output": output})
if skipped:
    print(f"[!] {skipped} lines skipped (malformed or missing fields)")
print(f"[βœ“] {len(examples)} training examples loaded")
def format_example(ex, system_prompt=None):
    """Render one example in the Llama 3 Instruct chat format.

    Args:
        ex: dict with "instruction" and "output" keys.
        system_prompt: overrides the module-level SYSTEM_PROMPT when given
            (backward-compatible; defaults to the Codette prompt).

    Returns:
        The formatted training string.

    Notes:
        - No <|begin_of_text|> here: the Llama tokenizer already prepends BOS
          when tokenizing with special tokens, so embedding it in the text
          would double it.
        - The official Llama 3 template puts "\\n\\n" after <|end_header_id|>,
          so inference with the stock chat template matches training.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    return (
        f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )
texts = [format_example(e) for e in examples]
# ── Tokenize ───────────────────────────────────────────────────────────────
print("[*] Tokenizing …")
def tokenize(batch):
    """Tokenize a batch of formatted chat strings.

    Padding is left to the data collator (dynamic per-batch padding),
    so only truncation to MAX_LEN happens here.
    """
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )
dataset = Dataset.from_dict({"text": texts})
# remove_columns drops the raw text so only token ids reach the Trainer.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
print(f"[βœ“] Tokenized {len(dataset)} examples")
# ── Training args ──────────────────────────────────────────────────────────
# One optimizer step consumes BATCH * GRAD_ACCUM examples.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
save_steps = steps_per_epoch if steps_per_epoch > 50 else 50
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,
    optim="adamw_torch",
    fp16=False,  # CPU β€” no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,       # keep only the newest checkpoint
    report_to=[],             # no W&B / TensorBoard
    dataloader_num_workers=0, # single-process loading on CPU flavor
)
# Causal-LM collator: labels are the (shifted) input ids, no masking.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
)
# ── Train ──────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()
print("[βœ“] Training complete")
# ── Save adapter locally ───────────────────────────────────────────────────
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
# PEFT model: only the adapter weights are written, not the base model.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# ── Push adapter to HF Hub ─────────────────────────────────────────────────
if not HF_TOKEN:
    print("[!] Skipping Hub push β€” no HF_TOKEN")
else:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    try:
        # exist_ok=True makes re-runs idempotent; failures are non-fatal
        # because push_to_hub can still succeed against an existing repo.
        HfApi().create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        print(f"[!] Repo create warning: {e}")
    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[βœ“] Adapter pushed β†’ https://huggingface.co/{ADAPTER_REPO}")
print("\nβœ… Done! Update app.py ADAPTER_PATH to point to the new adapter.")