Raiff1982 committed on
Commit
94c147f
Β·
verified Β·
1 Parent(s): 7ea3aaa

Upload train_codette_lora.py

Browse files
Files changed (1) hide show
  1. train_codette_lora.py +206 -0
train_codette_lora.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # dependencies = [
4
+ # "transformers>=4.40.0",
5
+ # "peft>=0.10.0",
6
+ # "datasets>=2.18.0",
7
+ # "torch>=2.2.0",
8
+ # "accelerate>=0.28.0",
9
+ # "huggingface_hub>=0.22.0",
10
+ # ]
11
+ # ///
12
+ """
13
+ Codette LoRA Fine-Tuning β€” HuggingFace Jobs
14
+ Base model : meta-llama/Llama-3.2-1B-Instruct
15
+ Adapter : LoRA r=16, targets q_proj / v_proj
16
+ Output : Raiff1982/codette-llama-adapter (HF Hub)
17
+
18
+ Run via HF Jobs:
19
+ hf jobs run train_codette_lora.py \
20
+ --flavor=cpu-basic \
21
+ --env HF_TOKEN=$HF_TOKEN
22
+ """
23
+
24
+ import os, json, math
25
+ from pathlib import Path
26
+
27
+ import torch
28
+ from datasets import Dataset
29
+ from transformers import (
30
+ AutoTokenizer,
31
+ AutoModelForCausalLM,
32
+ TrainingArguments,
33
+ Trainer,
34
+ DataCollatorForLanguageModeling,
35
+ )
36
+ from peft import LoraConfig, get_peft_model, TaskType
37
+ from huggingface_hub import HfApi, login
38
+
39
# ── Config ─────────────────────────────────────────────────────────────────
# All knobs for the run live here; HF_TOKEN comes from the job environment.
HF_TOKEN = os.environ.get("HF_TOKEN", "")          # empty string → no Hub auth
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"    # gated repo — token required
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"   # where adapter is pushed
DATA_REPO = "Raiff1982/codette-training"           # repo holding the JSONL training file
DATA_FILE = "codette_combined_train.jsonl"         # NOTE: rebound to a local path after download
MAX_LEN = 512        # tokenizer truncation length (tokens)
EPOCHS = 3
BATCH = 1            # per-device batch size (CPU run)
GRAD_ACCUM = 8       # effective batch = BATCH * GRAD_ACCUM = 8
LR = 2e-4
OUTPUT_DIR = "./codette_adapter_output"

# Codette system prompt — baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices — Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
61
# ── Auth ───────────────────────────────────────────────────────────────────
# Authenticate up front so every later Hub call (download + push) is authorized.
if not HF_TOKEN:
    print("[!] No HF_TOKEN — Hub push will fail")
else:
    login(token=HF_TOKEN)
    print("[✓] Logged in to HuggingFace Hub")
68
# ── Download training data ──────────────────────────────────────────────────
print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
from huggingface_hub import hf_hub_download  # local import kept from original layout
# NOTE(review): DATA_FILE is rebound here from a filename to the local cached
# path returned by hf_hub_download; all later code relies on it being the path.
# NOTE(review): repo_type="model" — confirm the training-data repo really is a
# model repo; if it is a dataset repo this must be repo_type="dataset".
DATA_FILE = hf_hub_download(
    repo_id=DATA_REPO,
    filename=DATA_FILE,
    repo_type="model",
    token=HF_TOKEN,
)
print(f"[✓] Training data at: {DATA_FILE}")
79
# ── Load tokenizer ─────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
# Llama tokenizers ship without a pad token; reuse EOS so padding/collation works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Right padding for causal-LM training (left padding is a generation-time concern).
tokenizer.padding_side = "right"
86
# ── Load base model (CPU safe — no device_map) ─────────────────────────────
print(f"[*] Loading base model …")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,   # full precision — fp16 is not usable on CPU
    low_cpu_mem_usage=True,      # stream weights in to keep peak RAM down
    token=HF_TOKEN,
)
95
# ── Add LoRA ───────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
lora_cfg = LoraConfig(
    r=16,                                  # adapter rank
    lora_alpha=16,                         # scaling = alpha / r = 1.0
    target_modules=["q_proj", "v_proj"],   # attention query/value projections only
    lora_dropout=0.05,
    bias="none",                           # bias terms stay frozen
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
108
# ── Load & format training data ────────────────────────────────────────────
# Parse the JSONL file: one {"instruction", "output"|"response"} object per line.
print(f"[*] Loading training data from {DATA_FILE} …")
examples = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped = raw_line.strip()
        if not stripped:
            continue  # tolerate blank lines
        record = json.loads(stripped)
        instruction = record.get("instruction", "")
        output = record.get("output", record.get("response", ""))
        # Keep only complete instruction/output pairs.
        if instruction and output:
            examples.append({"instruction": instruction, "output": output})

print(f"[✓] {len(examples)} training examples loaded")
125
def format_example(ex, system_prompt=None):
    """Render one training example in the Llama 3.x Instruct chat format.

    Args:
        ex: dict with "instruction" (user turn) and "output" (assistant turn).
        system_prompt: system message to bake in; defaults to the module-level
            SYSTEM_PROMPT. Exposed as a parameter so the formatter is reusable
            and testable in isolation (backward-compatible default).

    Returns:
        The fully templated training string.

    Fix: the official Llama 3 template places a double newline after
    "<|end_header_id|>" before each message body; the previous version used a
    single "\\n", deviating from the format the base model was trained on.
    """
    sp = SYSTEM_PROMPT if system_prompt is None else system_prompt
    # NOTE(review): the tokenizer may prepend <|begin_of_text|> on its own
    # (add_special_tokens); if so, this literal yields a duplicate BOS —
    # verify tokenizer behavior before a long run.
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{sp}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )
134
texts = [format_example(e) for e in examples]  # pre-render every chat string

# ── Tokenize ───────────────────────────────────────────────────────────────
print("[*] Tokenizing …")
def tokenize(batch):
    # Batched map: batch["text"] is a list of strings.
    # padding=False here — dynamic per-batch padding is done by the collator.
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )

dataset = Dataset.from_dict({"text": texts})
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
print(f"[✓] Tokenized {len(dataset)} examples")
150
# ── Training args ──────────────────────────────────────────────────────────
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
save_steps = max(50, steps_per_epoch)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,           # gradient clipping
    fp16=False,                  # CPU — no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,          # keep only the latest checkpoint (disk-friendly)
    report_to=[],                # no wandb/tensorboard in the job sandbox
    dataloader_num_workers=0,    # single-process data loading on CPU
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)
173
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # mlm=False → plain causal-LM labels (shifted inputs) with dynamic padding.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# ── Train ──────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()
print("[✓] Training complete")

# ── Save adapter locally ───────────────────────────────────────────────────
# PEFT model: only the LoRA adapter weights are written, not the base model.
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
190
# ── Push adapter to HF Hub ─────────────────────────────────────────────────
# Guard clause: without a token the push calls would just fail.
if not HF_TOKEN:
    print("[!] Skipping Hub push — no HF_TOKEN")
else:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    hub_api = HfApi()
    # Best-effort repo creation; exist_ok makes reruns idempotent.
    try:
        hub_api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        print(f"[!] Repo create warning: {e}")

    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[✓] Adapter pushed → https://huggingface.co/{ADAPTER_REPO}")

print("\n✅ Done! Update app.py ADAPTER_PATH to point to the new adapter.")