Raiff1982 committed on
Commit
c733811
·
verified ·
1 Parent(s): 0281435

Upload 2 files

Browse files
codette_combined_train.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
train_codette_lora.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Codette LoRA Fine-Tuning β€” HuggingFace Jobs
4
+ Base model : meta-llama/Llama-3.2-1B-Instruct
5
+ Adapter : LoRA r=16, targets q_proj / v_proj
6
+ Output : Raiff1982/codette-llama-adapter (HF Hub)
7
+
8
+ Run via HF Jobs:
9
+ hf jobs run train_codette_lora.py \
10
+ --flavor=cpu-basic \
11
+ --env HF_TOKEN=$HF_TOKEN
12
+ """
13
+
14
+ import os, json, math
15
+ from pathlib import Path
16
+
17
+ import torch
18
+ from datasets import Dataset
19
+ from transformers import (
20
+ AutoTokenizer,
21
+ AutoModelForCausalLM,
22
+ TrainingArguments,
23
+ Trainer,
24
+ DataCollatorForLanguageModeling,
25
+ )
26
+ from peft import LoraConfig, get_peft_model, TaskType
27
+ from huggingface_hub import HfApi, login
28
+
29
# ── Config ─────────────────────────────────────────────────────────────────
# All knobs for the run live here; edit these rather than the code below.
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # empty string => Hub push is skipped later
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_REPO = "Raiff1982/codette-llama-adapter"  # where adapter is pushed
DATA_FILE = "codette_combined_train.jsonl"  # upload this alongside script
MAX_LEN = 512      # max tokenized sequence length; longer examples are truncated
EPOCHS = 3
BATCH = 1          # per-device batch size (CPU-sized)
GRAD_ACCUM = 8     # effective batch = 8
LR = 2e-4
OUTPUT_DIR = "./codette_adapter_output"

# Codette system prompt β€” baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices β€” Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)
49
+
50
# ── Auth ───────────────────────────────────────────────────────────────────
# A missing token is non-fatal here: training still runs locally; only the
# final Hub push (guarded again at the bottom of the script) will be skipped.
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("[βœ“] Logged in to HuggingFace Hub")
else:
    print("[!] No HF_TOKEN β€” Hub push will fail")
56
+
57
# ── Load tokenizer ─────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
# Reuse EOS as pad token when none is defined so the data collator can pad.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad on the right for causal-LM training
63
+
64
# ── Load base model (CPU safe β€” no device_map) ─────────────────────────────
print(f"[*] Loading base model …")
# float32 throughout: this job targets CPU (see fp16=False in TrainingArguments).
# NOTE(review): `torch_dtype` is deprecated in favour of `dtype` in newer
# transformers releases — confirm the pinned version before renaming.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)
72
+
73
# ── Add LoRA ───────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
lora_cfg = LoraConfig(
    r=16,                                 # adapter rank
    lora_alpha=16,                        # scaling factor (alpha/r = 1.0)
    target_modules=["q_proj", "v_proj"],  # attention query/value projections only
    lora_dropout=0.05,
    bias="none",                          # keep base-model biases frozen
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_cfg)
# Sanity check in the job log: only the LoRA params should be trainable.
model.print_trainable_parameters()
85
+
86
# ── Load & format training data ────────────────────────────────────────────
# Reads DATA_FILE (JSONL, one object per line). Each usable object must have
# a non-empty "instruction" and either "output" or "response". Malformed
# lines are skipped with a warning instead of killing the whole training job.
print(f"[*] Loading training data from {DATA_FILE} …")
examples = []
with open(DATA_FILE, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            # One bad line should not abort a long (paid) HF Job.
            print(f"[!] Skipping malformed JSON on line {line_no}: {e}")
            continue
        instruction = obj.get("instruction", "")
        output = obj.get("output", obj.get("response", ""))
        if not instruction or not output:
            continue  # incomplete pair — nothing to learn from
        examples.append({"instruction": instruction, "output": output})

# Fail fast with a clear message instead of an obscure Trainer error later.
if not examples:
    raise SystemExit(f"[!] No usable training examples found in {DATA_FILE}")

print(f"[βœ“] {len(examples)} training examples loaded")
102
+
103
def format_example(ex):
    r"""Render one {instruction, output} pair as Llama 3.2 Instruct chat text.

    Two fixes relative to the original draft:
    * ``<|begin_of_text|>`` is no longer hard-coded: the Llama tokenizer
      inserts the BOS token itself during encoding, so including it in the
      text produced a duplicated BOS in every training sequence.
    * Each ``<|end_header_id|>`` is followed by a blank line (``\n\n``),
      matching the official Llama 3 chat template, instead of a single
      ``\n``.
    """
    return (
        f"<|start_header_id|>system<|end_header_id|>\n\n{SYSTEM_PROMPT}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )

texts = [format_example(e) for e in examples]
113
+
114
# ── Tokenize ───────────────────────────────────────────────────────────────
print("[*] Tokenizing …")

def tokenize(batch):
    """Tokenize a batch of chat-formatted strings.

    Truncates to MAX_LEN and leaves padding to the data collator, which
    pads each training batch dynamically.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )
    return encoded

# Build the dataset and replace the raw text column with token ids in one chain.
dataset = Dataset.from_dict({"text": texts}).map(
    tokenize, batched=True, remove_columns=["text"]
)
print(f"[βœ“] Tokenized {len(dataset)} examples")
127
+
128
# ── Training args ──────────────────────────────────────────────────────────
# One optimizer step consumes BATCH * GRAD_ACCUM examples.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
save_steps = max(50, steps_per_epoch)  # checkpoint roughly once per epoch, never more often than every 50 steps

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,  # CPU β€” no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,   # keep only the newest checkpoint (saves disk on the job VM)
    report_to=[],         # no wandb/tensorboard reporting
    dataloader_num_workers=0,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)
151
+
152
# The collator pads each batch dynamically and, with mlm=False, derives the
# causal-LM labels from input_ids (padding positions masked out).
# NOTE(review): passing the tokenizer positionally is deprecated in newer
# transformers versions — confirm the pinned version before changing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
158
+
159
# ── Train ──────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()  # blocking; progress is logged every `logging_steps` steps
print("[βœ“] Training complete")
163
+
164
# ── Save adapter locally ───────────────────────────────────────────────────
# On a PEFT-wrapped model, save_pretrained writes only the adapter
# weights/config, not the full base model.
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
168
+
169
# ── Push adapter to HF Hub ─────────────────────────────────────────────────
if HF_TOKEN:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    api = HfApi()
    # Create repo if needed
    try:
        api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        # Best-effort: the push below still runs and will surface any real error.
        print(f"[!] Repo create warning: {e}")

    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)      # adapter weights + config
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)  # tokenizer files
    print(f"[βœ“] Adapter pushed β†’ https://huggingface.co/{ADAPTER_REPO}")
else:
    print("[!] Skipping Hub push β€” no HF_TOKEN")
184
+
185
+ print("\nβœ… Done! Update app.py ADAPTER_PATH to point to the new adapter.")