| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | Codette LoRA Fine-Tuning β HuggingFace Jobs |
| | Base model : meta-llama/Llama-3.2-1B-Instruct |
| | Adapter : LoRA r=16, targets q_proj / v_proj |
| | Output : Raiff1982/codette-llama-adapter (HF Hub) |
| | |
| | Run via HF Jobs: |
| | hf jobs run train_codette_lora.py \ |
| | --flavor=cpu-basic \ |
| | --env HF_TOKEN=$HF_TOKEN |
| | """ |
| |
|
import json
import math
import os
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download, login
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
| |
|
| | |
| | HF_TOKEN = os.environ.get("HF_TOKEN", "") |
| | BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct" |
| | ADAPTER_REPO = "Raiff1982/codette-llama-adapter" |
| | DATA_REPO = "Raiff1982/codette-training" |
| | DATA_FILE = "codette_v2_train.jsonl" |
| | MAX_LEN = 512 |
| | EPOCHS = 3 |
| | BATCH = 1 |
| | GRAD_ACCUM = 8 |
| | LR = 2e-4 |
| | OUTPUT_DIR = "./codette_adapter_output" |
| |
|
| | |
# System prompt injected as the "system" turn of every training example.
# (Repaired mojibake: the voice-list separator was garbled; restored as an
# em dash — confirm against the original prompt text if available.)
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices — Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
| |
|
| | |
# ---------------------------------------------------------------------------
# Hub authentication — best-effort: training still runs without a token, but
# the final push_to_hub step will fail. (Status-marker glyphs were mojibake;
# restored as ✓ / — .)
# ---------------------------------------------------------------------------
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("[✓] Logged in to HuggingFace Hub")
else:
    print("[!] No HF_TOKEN — Hub push will fail")
| |
|
| | |
| | print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...") |
| | from huggingface_hub import hf_hub_download |
| | DATA_FILE = hf_hub_download( |
| | repo_id=DATA_REPO, |
| | filename=DATA_FILE, |
| | repo_type="model", |
| | token=HF_TOKEN, |
| | ) |
| | print(f"[β] Training data at: {DATA_FILE}") |
| |
|
| | |
| | print(f"[*] Loading tokenizer from {BASE_MODEL} β¦") |
| | tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN) |
| | if tokenizer.pad_token is None: |
| | tokenizer.pad_token = tokenizer.eos_token |
| | tokenizer.padding_side = "right" |
| |
|
| | |
| | print(f"[*] Loading base model β¦") |
| | model = AutoModelForCausalLM.from_pretrained( |
| | BASE_MODEL, |
| | torch_dtype=torch.float32, |
| | low_cpu_mem_usage=True, |
| | token=HF_TOKEN, |
| | ) |
| |
|
| | |
| | print("[*] Attaching LoRA adapters β¦") |
| | lora_cfg = LoraConfig( |
| | r=16, |
| | lora_alpha=16, |
| | target_modules=["q_proj", "v_proj"], |
| | lora_dropout=0.05, |
| | bias="none", |
| | task_type=TaskType.CAUSAL_LM, |
| | ) |
| | model = get_peft_model(model, lora_cfg) |
| | model.print_trainable_parameters() |
| |
|
| | |
| | print(f"[*] Loading training data from {DATA_FILE} β¦") |
| | examples = [] |
| | with open(DATA_FILE, "r", encoding="utf-8") as f: |
| | for line in f: |
| | line = line.strip() |
| | if not line: |
| | continue |
| | obj = json.loads(line) |
| | instruction = obj.get("instruction", "") |
| | output = obj.get("output", obj.get("response", "")) |
| | if not instruction or not output: |
| | continue |
| | examples.append({"instruction": instruction, "output": output}) |
| |
|
| | print(f"[β] {len(examples)} training examples loaded") |
| |
|
def format_example(ex: dict) -> str:
    """Render one example in the Llama 3.x Instruct chat format.

    The official Llama 3 template places a DOUBLE newline after
    <|end_header_id|> before the turn content; the previous single newline
    diverged from the base model's prompt format, which hurts transfer
    between fine-tuning and inference.
    """
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{SYSTEM_PROMPT}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )


texts = [format_example(e) for e in examples]
| |
|
| | |
| | print("[*] Tokenizing β¦") |
| | def tokenize(batch): |
| | return tokenizer( |
| | batch["text"], |
| | max_length=MAX_LEN, |
| | truncation=True, |
| | padding=False, |
| | ) |
| |
|
| | dataset = Dataset.from_dict({"text": texts}) |
| | dataset = dataset.map(tokenize, batched=True, remove_columns=["text"]) |
| | print(f"[β] Tokenized {len(dataset)} examples") |
| |
|
| | |
# One optimizer step consumes BATCH * GRAD_ACCUM examples.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
save_steps = max(50, steps_per_epoch)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,  # NOTE(review): may exceed total steps on tiny datasets — confirm
    weight_decay=0.01,
    max_grad_norm=1.0,  # gradient clipping
    fp16=False,  # CPU training: half precision unavailable
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,  # keep only the most recent checkpoint (disk-constrained job)
    report_to=[],  # disable wandb/tensorboard reporting
    dataloader_num_workers=0,  # single-process data loading on cpu-basic
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)
| |
|
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # mlm=False -> causal-LM collation: labels are a shifted copy of input_ids.
    # tokenizer passed by keyword: the positional form is deprecated in recent
    # transformers releases.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
| |
|
| | |
| | print("\n[*] Training started β¦") |
| | trainer.train() |
| | print("[β] Training complete") |
| |
|
| | |
| | print(f"[*] Saving adapter to {OUTPUT_DIR} β¦") |
| | model.save_pretrained(OUTPUT_DIR) |
| | tokenizer.save_pretrained(OUTPUT_DIR) |
| |
|
| | |
# ---------------------------------------------------------------------------
# Push the adapter to the Hub (skipped without a token).
# ---------------------------------------------------------------------------
if HF_TOKEN:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    api = HfApi()

    try:
        api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        # exist_ok=True already tolerates a pre-existing repo; any other failure
        # (permissions, network) is reported but must not abort the job — the
        # adapter is still saved locally and push_to_hub below will surface a
        # hard error if the repo truly cannot be used.
        print(f"[!] Repo create warning: {e}")

    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[✓] Adapter pushed → https://huggingface.co/{ADAPTER_REPO}")
else:
    print("[!] Skipping Hub push — no HF_TOKEN")
| |
|
| | print("\nβ
Done! Update app.py ADAPTER_PATH to point to the new adapter.") |
| |
|