Raiff1982 committed on
Commit
94c147f
Β·
verified Β·
1 Parent(s): 7ea3aaa

Upload train_codette_lora.py

Browse files
Files changed (1) hide show
  1. train_codette_lora.py +206 -0
train_codette_lora.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # dependencies = [
4
+ # "transformers>=4.40.0",
5
+ # "peft>=0.10.0",
6
+ # "datasets>=2.18.0",
7
+ # "torch>=2.2.0",
8
+ # "accelerate>=0.28.0",
9
+ # "huggingface_hub>=0.22.0",
10
+ # ]
11
+ # ///
12
+ """
13
+ Codette LoRA Fine-Tuning β€” HuggingFace Jobs
14
+ Base model : meta-llama/Llama-3.2-1B-Instruct
15
+ Adapter : LoRA r=16, targets q_proj / v_proj
16
+ Output : Raiff1982/codette-llama-adapter (HF Hub)
17
+
18
+ Run via HF Jobs:
19
+ hf jobs run train_codette_lora.py \
20
+ --flavor=cpu-basic \
21
+ --env HF_TOKEN=$HF_TOKEN
22
+ """
23
+
24
+ import os, json, math
25
+ from pathlib import Path
26
+
27
+ import torch
28
+ from datasets import Dataset
29
+ from transformers import (
30
+ AutoTokenizer,
31
+ AutoModelForCausalLM,
32
+ TrainingArguments,
33
+ Trainer,
34
+ DataCollatorForLanguageModeling,
35
+ )
36
+ from peft import LoraConfig, get_peft_model, TaskType
37
+ from huggingface_hub import HfApi, login
38
+
39
# ── Config ─────────────────────────────────────────────────────────────────
# All knobs for the run live here; HF_TOKEN comes from the job environment.
HF_TOKEN = os.environ.get("HF_TOKEN", "")          # empty string → no Hub auth
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"    # gated repo — token required
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"   # where adapter is pushed
DATA_REPO = "Raiff1982/codette-training"           # repo holding the JSONL training file
DATA_FILE = "codette_combined_train.jsonl"         # NOTE: rebound to a local path after download
MAX_LEN = 512        # tokenizer truncation length (tokens)
EPOCHS = 3
BATCH = 1            # per-device batch size (CPU run)
GRAD_ACCUM = 8       # effective batch = BATCH * GRAD_ACCUM = 8
LR = 2e-4
OUTPUT_DIR = "./codette_adapter_output"

# Codette system prompt — baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices — Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
61
# ── Auth ───────────────────────────────────────────────────────────────────
# Authenticate up front so every later Hub call (download + push) is authorized.
if not HF_TOKEN:
    print("[!] No HF_TOKEN — Hub push will fail")
else:
    login(token=HF_TOKEN)
    print("[✓] Logged in to HuggingFace Hub")
68
# ── Download training data ──────────────────────────────────────────────────
print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
from huggingface_hub import hf_hub_download  # local import kept from original layout
# NOTE(review): DATA_FILE is rebound here from a filename to the local cached
# path returned by hf_hub_download; all later code relies on it being the path.
# NOTE(review): repo_type="model" — confirm the training-data repo really is a
# model repo; if it is a dataset repo this must be repo_type="dataset".
DATA_FILE = hf_hub_download(
    repo_id=DATA_REPO,
    filename=DATA_FILE,
    repo_type="model",
    token=HF_TOKEN,
)
print(f"[✓] Training data at: {DATA_FILE}")
79
# ── Load tokenizer ─────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
# Llama tokenizers ship without a pad token; reuse EOS so padding/collation works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Right padding for causal-LM training (left padding is a generation-time concern).
tokenizer.padding_side = "right"
86
# ── Load base model (CPU safe — no device_map) ─────────────────────────────
print(f"[*] Loading base model …")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,   # full precision — fp16 is not usable on CPU
    low_cpu_mem_usage=True,      # stream weights in to keep peak RAM down
    token=HF_TOKEN,
)
95
# ── Add LoRA ───────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
lora_cfg = LoraConfig(
    r=16,                                  # adapter rank
    lora_alpha=16,                         # scaling = alpha / r = 1.0
    target_modules=["q_proj", "v_proj"],   # attention query/value projections only
    lora_dropout=0.05,
    bias="none",                           # bias terms stay frozen
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()
108
# ── Load & format training data ────────────────────────────────────────────
# Parse the JSONL file: one {"instruction", "output"|"response"} object per line.
print(f"[*] Loading training data from {DATA_FILE} …")
examples = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped = raw_line.strip()
        if not stripped:
            continue  # tolerate blank lines
        record = json.loads(stripped)
        instruction = record.get("instruction", "")
        output = record.get("output", record.get("response", ""))
        # Keep only complete instruction/output pairs.
        if instruction and output:
            examples.append({"instruction": instruction, "output": output})

print(f"[✓] {len(examples)} training examples loaded")
125
def format_example(ex, system_prompt=None):
    """Render one training example in the Llama 3.x Instruct chat format.

    Args:
        ex: dict with "instruction" (user turn) and "output" (assistant turn).
        system_prompt: system message to bake in; defaults to the module-level
            SYSTEM_PROMPT. Exposed as a parameter so the formatter is reusable
            and testable in isolation (backward-compatible default).

    Returns:
        The fully templated training string.

    Fix: the official Llama 3 template places a double newline after
    "<|end_header_id|>" before each message body; the previous version used a
    single "\\n", deviating from the format the base model was trained on.
    """
    sp = SYSTEM_PROMPT if system_prompt is None else system_prompt
    # NOTE(review): the tokenizer may prepend <|begin_of_text|> on its own
    # (add_special_tokens); if so, this literal yields a duplicate BOS —
    # verify tokenizer behavior before a long run.
    return (
        "<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{sp}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )
134
texts = [format_example(e) for e in examples]  # pre-render every chat string

# ── Tokenize ───────────────────────────────────────────────────────────────
print("[*] Tokenizing …")
def tokenize(batch):
    # Batched map: batch["text"] is a list of strings.
    # padding=False here — dynamic per-batch padding is done by the collator.
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )

dataset = Dataset.from_dict({"text": texts})
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
print(f"[✓] Tokenized {len(dataset)} examples")
150
# ── Training args ──────────────────────────────────────────────────────────
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
save_steps = max(50, steps_per_epoch)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,           # gradient clipping
    fp16=False,                  # CPU — no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,          # keep only the latest checkpoint (disk-friendly)
    report_to=[],                # no wandb/tensorboard in the job sandbox
    dataloader_num_workers=0,    # single-process data loading on CPU
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)
173
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # mlm=False → plain causal-LM labels (shifted inputs) with dynamic padding.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# ── Train ──────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()
print("[✓] Training complete")

# ── Save adapter locally ───────────────────────────────────────────────────
# PEFT model: only the LoRA adapter weights are written, not the base model.
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
190
# ── Push adapter to HF Hub ─────────────────────────────────────────────────
# Guard clause: without a token the push calls would just fail.
if not HF_TOKEN:
    print("[!] Skipping Hub push — no HF_TOKEN")
else:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    hub_api = HfApi()
    # Best-effort repo creation; exist_ok makes reruns idempotent.
    try:
        hub_api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        print(f"[!] Repo create warning: {e}")

    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[✓] Adapter pushed → https://huggingface.co/{ADAPTER_REPO}")

print("\n✅ Done! Update app.py ADAPTER_PATH to point to the new adapter.")