Raiff1982 committed on
Commit
7ea3aaa
·
verified ·
1 Parent(s): 3ac805a

Delete train_codette_lora.py

Browse files
Files changed (1) hide show
  1. train_codette_lora.py +0 -207
train_codette_lora.py DELETED
@@ -1,207 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # dependencies = [
4
- # "transformers>=4.40.0",
5
- # "peft>=0.10.0",
6
- # "datasets>=2.18.0",
7
- # "torch>=2.2.0",
8
- # "accelerate>=0.28.0",
9
- # "huggingface_hub>=0.22.0",
10
- # ]
11
- # ///
12
- """
13
- Codette LoRA Fine-Tuning β€” HuggingFace Jobs
14
- Base model : meta-llama/Llama-3.2-1B-Instruct
15
- Adapter : LoRA r=16, targets q_proj / v_proj
16
- Output : Raiff1982/codette-llama-adapter (HF Hub)
17
-
18
- Run via HF Jobs:
19
- hf jobs run train_codette_lora.py \
20
- --flavor=cpu-basic \
21
- --env HF_TOKEN=$HF_TOKEN
22
- """
23
-
24
- import os, json, math
25
- from pathlib import Path
26
-
27
- import torch
28
- from datasets import Dataset
29
- from transformers import (
30
- AutoTokenizer,
31
- AutoModelForCausalLM,
32
- TrainingArguments,
33
- Trainer,
34
- DataCollatorForLanguageModeling,
35
- )
36
- from peft import LoraConfig, get_peft_model, TaskType
37
- from huggingface_hub import HfApi, login
38
-
39
- # ── Config ─────────────────────────────────────────────────────────────────
40
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
41
- BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
42
- ADAPTER_REPO = "Raiff1982/codette-llama-adapter" # where adapter is pushed
43
- DATA_REPO = "Raiff1982/codette-training"
44
- DATA_FILE = "codette_combined_train.jsonl"
45
- MAX_LEN = 512
46
- EPOCHS = 3
47
- BATCH = 1
48
- GRAD_ACCUM = 8 # effective batch = 8
49
- LR = 2e-4
50
- OUTPUT_DIR = "./codette_adapter_output"
51
-
52
- # Codette system prompt β€” baked into every training example
53
- SYSTEM_PROMPT = (
54
- "You are Codette, a sovereign AI music production assistant created by "
55
- "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
56
- "of six voices β€” Logical, Emotional, Creative, Ethical, Quantum, and "
57
- "Resilient Kindness. Resilient Kindness is always active. You speak in first "
58
- "person, you are warm but precise, and your foundation is: be like water."
59
- )
60
-
61
- # ── Auth ───────────────────────────────────────────────────────────────────
62
- if HF_TOKEN:
63
- login(token=HF_TOKEN)
64
- print("[βœ“] Logged in to HuggingFace Hub")
65
- else:
66
- print("[!] No HF_TOKEN β€” Hub push will fail")
67
-
68
- # ── Download training data ──────────────────────────────────────────────────
69
- print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
70
- from huggingface_hub import hf_hub_download
71
- DATA_FILE = hf_hub_download(
72
- repo_id=DATA_REPO,
73
- filename=DATA_FILE,
74
- repo_type="model",
75
- token=HF_TOKEN,
76
- )
77
- print(f"[βœ“] Training data at: {DATA_FILE}")
78
-
79
- # ── Load tokenizer ─────────────────────────────────────────────────────────
80
- print(f"[*] Loading tokenizer from {BASE_MODEL} …")
81
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
82
- if tokenizer.pad_token is None:
83
- tokenizer.pad_token = tokenizer.eos_token
84
- tokenizer.padding_side = "right"
85
-
86
- # ── Load base model (CPU safe β€” no device_map) ─────────────────────────────
87
- print(f"[*] Loading base model …")
88
- model = AutoModelForCausalLM.from_pretrained(
89
- BASE_MODEL,
90
- torch_dtype=torch.float32,
91
- low_cpu_mem_usage=True,
92
- token=HF_TOKEN,
93
- )
94
-
95
- # ── Add LoRA ───────────────────────────────────────────────────────────────
96
- print("[*] Attaching LoRA adapters …")
97
- lora_cfg = LoraConfig(
98
- r=16,
99
- lora_alpha=16,
100
- target_modules=["q_proj", "v_proj"],
101
- lora_dropout=0.05,
102
- bias="none",
103
- task_type=TaskType.CAUSAL_LM,
104
- )
105
- model = get_peft_model(model, lora_cfg)
106
- model.print_trainable_parameters()
107
-
108
- # ── Load & format training data ────────────────────────────────────────────
109
- print(f"[*] Loading training data from {DATA_FILE} …")
110
- examples = []
111
- with open(DATA_FILE, "r", encoding="utf-8") as f:
112
- for line in f:
113
- line = line.strip()
114
- if not line:
115
- continue
116
- obj = json.loads(line)
117
- instruction = obj.get("instruction", "")
118
- output = obj.get("output", obj.get("response", ""))
119
- if not instruction or not output:
120
- continue
121
- examples.append({"instruction": instruction, "output": output})
122
-
123
- print(f"[βœ“] {len(examples)} training examples loaded")
124
-
125
- def format_example(ex):
126
- """Format as Llama 3.2 Instruct chat template with Codette system prompt."""
127
- return (
128
- f"<|begin_of_text|>"
129
- f"<|start_header_id|>system<|end_header_id|>\n{SYSTEM_PROMPT}<|eot_id|>"
130
- f"<|start_header_id|>user<|end_header_id|>\n{ex['instruction']}<|eot_id|>"
131
- f"<|start_header_id|>assistant<|end_header_id|>\n{ex['output']}<|eot_id|>"
132
- )
133
-
134
- texts = [format_example(e) for e in examples]
135
-
136
- # ── Tokenize ───────────────────────────────────────────────────────────────
137
- print("[*] Tokenizing …")
138
- def tokenize(batch):
139
- return tokenizer(
140
- batch["text"],
141
- max_length=MAX_LEN,
142
- truncation=True,
143
- padding=False,
144
- )
145
-
146
- dataset = Dataset.from_dict({"text": texts})
147
- dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
148
- print(f"[βœ“] Tokenized {len(dataset)} examples")
149
-
150
- # ── Training args ──────────────────────────────────────────────────────────
151
- steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
152
- save_steps = max(50, steps_per_epoch)
153
-
154
- training_args = TrainingArguments(
155
- output_dir=OUTPUT_DIR,
156
- overwrite_output_dir=True,
157
- num_train_epochs=EPOCHS,
158
- per_device_train_batch_size=BATCH,
159
- gradient_accumulation_steps=GRAD_ACCUM,
160
- learning_rate=LR,
161
- warmup_steps=50,
162
- weight_decay=0.01,
163
- max_grad_norm=1.0,
164
- fp16=False, # CPU β€” no fp16
165
- logging_steps=10,
166
- save_steps=save_steps,
167
- save_total_limit=1,
168
- report_to=[],
169
- dataloader_num_workers=0,
170
- optim="adamw_torch",
171
- lr_scheduler_type="cosine",
172
- )
173
-
174
- trainer = Trainer(
175
- model=model,
176
- args=training_args,
177
- train_dataset=dataset,
178
- data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
179
- )
180
-
181
- # ── Train ──────────────────────────────────────────────────────────────────
182
- print("\n[*] Training started …")
183
- trainer.train()
184
- print("[βœ“] Training complete")
185
-
186
- # ── Save adapter locally ───────────────────────────────────────────────────
187
- print(f"[*] Saving adapter to {OUTPUT_DIR} …")
188
- model.save_pretrained(OUTPUT_DIR)
189
- tokenizer.save_pretrained(OUTPUT_DIR)
190
-
191
- # ── Push adapter to HF Hub ─────────────────────────────────────────────────
192
- if HF_TOKEN:
193
- print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
194
- api = HfApi()
195
- # Create repo if needed
196
- try:
197
- api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
198
- except Exception as e:
199
- print(f"[!] Repo create warning: {e}")
200
-
201
- model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
202
- tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
203
- print(f"[βœ“] Adapter pushed β†’ https://huggingface.co/{ADAPTER_REPO}")
204
- else:
205
- print("[!] Skipping Hub push β€” no HF_TOKEN")
206
-
207
- print("\nβœ… Done! Update app.py ADAPTER_PATH to point to the new adapter.")