#!/usr/bin/env python3
# /// script
# dependencies = [
# "transformers>=4.40.0",
# "peft>=0.10.0",
# "datasets>=2.18.0",
# "torch>=2.2.0",
# "accelerate>=0.28.0",
# "huggingface_hub>=0.22.0",
# ]
# ///
"""
Codette LoRA Fine-Tuning β HuggingFace Jobs
Base model : meta-llama/Llama-3.2-1B-Instruct
Adapter : LoRA r=16, targets q_proj / v_proj
Output : Raiff1982/codette-llama-adapter (HF Hub)
Run via HF Jobs:
hf jobs run train_codette_lora.py \
--flavor=cpu-basic \
--env HF_TOKEN=$HF_TOKEN
"""
import json
import math
import os
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download, login
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
# ── Config ──────────────────────────────────────────────────────────────────
HF_TOKEN = os.environ.get("HF_TOKEN", "")            # Hub auth token (empty → push disabled)
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"     # where adapter is pushed
DATA_REPO = "Raiff1982/codette-training"
DATA_FILE = "codette_v2_train.jsonl"
MAX_LEN = 512        # max tokenized sequence length (longer examples truncated)
EPOCHS = 3
BATCH = 1
GRAD_ACCUM = 8       # effective batch = BATCH * GRAD_ACCUM = 8
LR = 2e-4
OUTPUT_DIR = "./codette_adapter_output"

# Codette system prompt — baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices — Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
# ── Auth ────────────────────────────────────────────────────────────────────
# Log in eagerly so downloads of the gated Llama base model work; without a
# token, training still runs but the final Hub push will fail.
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("[✓] Logged in to HuggingFace Hub")
else:
    print("[!] No HF_TOKEN — Hub push will fail")
# ── Download training data ──────────────────────────────────────────────────
# hf_hub_download is imported at the top of the file (was a mid-file import).
print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
# NOTE: DATA_FILE is rebound from the repo filename to the local cached path;
# everything below reads the local path.
# NOTE(review): repo_type="model" — training data often lives in a "dataset"
# repo; confirm the actual repo type of DATA_REPO before changing this.
DATA_FILE = hf_hub_download(
    repo_id=DATA_REPO,
    filename=DATA_FILE,
    repo_type="model",
    token=HF_TOKEN,
)
print(f"[✓] Training data at: {DATA_FILE}")
# ── Load tokenizer ──────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
if tokenizer.pad_token is None:
    # Llama tokenizers ship without a pad token; reuse EOS so the data
    # collator can pad batches.
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# ── Load base model (CPU safe — no device_map) ──────────────────────────────
print("[*] Loading base model …")  # plain string (was a placeholder-less f-string)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,   # full precision — CPU has no fp16 kernels
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
# ── Add LoRA ────────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
lora_cfg = LoraConfig(
    r=16,                                   # adapter rank
    lora_alpha=16,                          # scaling factor (alpha / r = 1.0)
    target_modules=["q_proj", "v_proj"],    # attention query/value projections
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
# ── Load & format training data ─────────────────────────────────────────────
print(f"[*] Loading training data from {DATA_FILE} …")
examples = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for lineno, raw in enumerate(f, start=1):
        raw = raw.strip()
        if not raw:
            continue
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError as e:
            # One malformed row should not abort the whole training job.
            print(f"[!] Skipping malformed JSON on line {lineno}: {e}")
            continue
        instruction = obj.get("instruction", "")
        # Accept either "output" or the legacy "response" field as the target.
        output = obj.get("output", obj.get("response", ""))
        if not instruction or not output:
            continue
        examples.append({"instruction": instruction, "output": output})
if not examples:
    # Fail fast with a clear message instead of a cryptic Trainer error later.
    raise RuntimeError(f"No usable training examples found in {DATA_FILE}")
print(f"[✓] {len(examples)} training examples loaded")
def format_example(ex, system_prompt=None):
    """Format one example in the Llama 3.x Instruct chat template.

    Args:
        ex: dict with "instruction" (user turn) and "output" (assistant turn).
        system_prompt: optional system message; defaults to SYSTEM_PROMPT.

    Returns:
        The fully templated training string.

    The official Llama 3 template places a blank line ("\\n\\n") after each
    <|end_header_id|>; the previous single "\\n" did not match the base
    model's chat format.
    """
    sys_msg = SYSTEM_PROMPT if system_prompt is None else system_prompt
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{sys_msg}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )
texts = [format_example(e) for e in examples]

# ── Tokenize ────────────────────────────────────────────────────────────────
print("[*] Tokenizing …")

def tokenize(batch):
    """Tokenize a batch of formatted strings (no padding here — the data
    collator pads dynamically per batch)."""
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )

dataset = Dataset.from_dict({"text": texts})
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
print(f"[✓] Tokenized {len(dataset)} examples")
# ── Training args ───────────────────────────────────────────────────────────
# One optimizer step consumes BATCH * GRAD_ACCUM examples.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
save_steps = max(50, steps_per_epoch)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,  # NOTE(review): may exceed total steps on very small datasets — confirm
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,  # CPU — no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,   # keep only the newest checkpoint on disk
    report_to=[],         # disable wandb/tensorboard reporting
    dataloader_num_workers=0,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # mlm=False → plain causal-LM labels (inputs copied/shifted), not masked LM.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# ── Train ───────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()
print("[✓] Training complete")

# ── Save adapter locally ────────────────────────────────────────────────────
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
model.save_pretrained(OUTPUT_DIR)       # PEFT model: writes only adapter weights
tokenizer.save_pretrained(OUTPUT_DIR)
# ── Push adapter to HF Hub ──────────────────────────────────────────────────
if HF_TOKEN:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    api = HfApi()
    # Create repo if needed; exist_ok makes reruns idempotent, and any other
    # failure is downgraded to a warning so the push below can still be tried.
    try:
        api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        print(f"[!] Repo create warning: {e}")
    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[✓] Adapter pushed → https://huggingface.co/{ADAPTER_REPO}")
else:
    print("[!] Skipping Hub push — no HF_TOKEN")

# The final message was split across lines by extraction garbling (an
# unterminated string literal); reassembled into one valid print call.
print("\n✅ Done! Update app.py ADAPTER_PATH to point to the new adapter.")