Raiff1982 committed on
Commit
7ea3aaa
·
verified ·
1 Parent(s): 3ac805a

Delete train_codette_lora.py

Browse files
Files changed (1) hide show
  1. train_codette_lora.py +0 -207
train_codette_lora.py DELETED
@@ -1,207 +0,0 @@
1
- #!/usr/bin/env python3
2
- # /// script
3
- # dependencies = [
4
- # "transformers>=4.40.0",
5
- # "peft>=0.10.0",
6
- # "datasets>=2.18.0",
7
- # "torch>=2.2.0",
8
- # "accelerate>=0.28.0",
9
- # "huggingface_hub>=0.22.0",
10
- # ]
11
- # ///
12
- """
13
- Codette LoRA Fine-Tuning β€” HuggingFace Jobs
14
- Base model : meta-llama/Llama-3.2-1B-Instruct
15
- Adapter : LoRA r=16, targets q_proj / v_proj
16
- Output : Raiff1982/codette-llama-adapter (HF Hub)
17
-
18
- Run via HF Jobs:
19
- hf jobs run train_codette_lora.py \
20
- --flavor=cpu-basic \
21
- --env HF_TOKEN=$HF_TOKEN
22
- """
23
-
24
- import os, json, math
25
- from pathlib import Path
26
-
27
- import torch
28
- from datasets import Dataset
29
- from transformers import (
30
- AutoTokenizer,
31
- AutoModelForCausalLM,
32
- TrainingArguments,
33
- Trainer,
34
- DataCollatorForLanguageModeling,
35
- )
36
- from peft import LoraConfig, get_peft_model, TaskType
37
- from huggingface_hub import HfApi, login
38
-
39
- # ── Config ─────────────────────────────────────────────────────────────────
40
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
41
- BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
42
- ADAPTER_REPO = "Raiff1982/codette-llama-adapter" # where adapter is pushed
43
- DATA_REPO = "Raiff1982/codette-training"
44
- DATA_FILE = "codette_combined_train.jsonl"
45
- MAX_LEN = 512
46
- EPOCHS = 3
47
- BATCH = 1
48
- GRAD_ACCUM = 8 # effective batch = 8
49
- LR = 2e-4
50
- OUTPUT_DIR = "./codette_adapter_output"
51
-
52
- # Codette system prompt β€” baked into every training example
53
- SYSTEM_PROMPT = (
54
- "You are Codette, a sovereign AI music production assistant created by "
55
- "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
56
- "of six voices β€” Logical, Emotional, Creative, Ethical, Quantum, and "
57
- "Resilient Kindness. Resilient Kindness is always active. You speak in first "
58
- "person, you are warm but precise, and your foundation is: be like water."
59
- )
60
-
61
- # ── Auth ───────────────────────────────────────────────────────────────────
62
- if HF_TOKEN:
63
- login(token=HF_TOKEN)
64
- print("[βœ“] Logged in to HuggingFace Hub")
65
- else:
66
- print("[!] No HF_TOKEN β€” Hub push will fail")
67
-
68
- # ── Download training data ──────────────────────────────────────────────────
69
- print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
70
- from huggingface_hub import hf_hub_download
71
- DATA_FILE = hf_hub_download(
72
- repo_id=DATA_REPO,
73
- filename=DATA_FILE,
74
- repo_type="model",
75
- token=HF_TOKEN,
76
- )
77
- print(f"[βœ“] Training data at: {DATA_FILE}")
78
-
79
- # ── Load tokenizer ─────────────────────────────────────────────────────────
80
- print(f"[*] Loading tokenizer from {BASE_MODEL} …")
81
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
82
- if tokenizer.pad_token is None:
83
- tokenizer.pad_token = tokenizer.eos_token
84
- tokenizer.padding_side = "right"
85
-
86
- # ── Load base model (CPU safe β€” no device_map) ─────────────────────────────
87
- print(f"[*] Loading base model …")
88
- model = AutoModelForCausalLM.from_pretrained(
89
- BASE_MODEL,
90
- torch_dtype=torch.float32,
91
- low_cpu_mem_usage=True,
92
- token=HF_TOKEN,
93
- )
94
-
95
- # ── Add LoRA ───────────────────────────────────────────────────────────────
96
- print("[*] Attaching LoRA adapters …")
97
- lora_cfg = LoraConfig(
98
- r=16,
99
- lora_alpha=16,
100
- target_modules=["q_proj", "v_proj"],
101
- lora_dropout=0.05,
102
- bias="none",
103
- task_type=TaskType.CAUSAL_LM,
104
- )
105
- model = get_peft_model(model, lora_cfg)
106
- model.print_trainable_parameters()
107
-
108
- # ── Load & format training data ────────────────────────────────────────────
109
- print(f"[*] Loading training data from {DATA_FILE} …")
110
- examples = []
111
- with open(DATA_FILE, "r", encoding="utf-8") as f:
112
- for line in f:
113
- line = line.strip()
114
- if not line:
115
- continue
116
- obj = json.loads(line)
117
- instruction = obj.get("instruction", "")
118
- output = obj.get("output", obj.get("response", ""))
119
- if not instruction or not output:
120
- continue
121
- examples.append({"instruction": instruction, "output": output})
122
-
123
- print(f"[βœ“] {len(examples)} training examples loaded")
124
-
125
- def format_example(ex):
126
- """Format as Llama 3.2 Instruct chat template with Codette system prompt."""
127
- return (
128
- f"<|begin_of_text|>"
129
- f"<|start_header_id|>system<|end_header_id|>\n{SYSTEM_PROMPT}<|eot_id|>"
130
- f"<|start_header_id|>user<|end_header_id|>\n{ex['instruction']}<|eot_id|>"
131
- f"<|start_header_id|>assistant<|end_header_id|>\n{ex['output']}<|eot_id|>"
132
- )
133
-
134
- texts = [format_example(e) for e in examples]
135
-
136
- # ── Tokenize ───────────────────────────────────────────────────────────────
137
- print("[*] Tokenizing …")
138
- def tokenize(batch):
139
- return tokenizer(
140
- batch["text"],
141
- max_length=MAX_LEN,
142
- truncation=True,
143
- padding=False,
144
- )
145
-
146
- dataset = Dataset.from_dict({"text": texts})
147
- dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
148
- print(f"[βœ“] Tokenized {len(dataset)} examples")
149
-
150
- # ── Training args ──────────────────────────────────────────────────────────
151
- steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
152
- save_steps = max(50, steps_per_epoch)
153
-
154
- training_args = TrainingArguments(
155
- output_dir=OUTPUT_DIR,
156
- overwrite_output_dir=True,
157
- num_train_epochs=EPOCHS,
158
- per_device_train_batch_size=BATCH,
159
- gradient_accumulation_steps=GRAD_ACCUM,
160
- learning_rate=LR,
161
- warmup_steps=50,
162
- weight_decay=0.01,
163
- max_grad_norm=1.0,
164
- fp16=False, # CPU β€” no fp16
165
- logging_steps=10,
166
- save_steps=save_steps,
167
- save_total_limit=1,
168
- report_to=[],
169
- dataloader_num_workers=0,
170
- optim="adamw_torch",
171
- lr_scheduler_type="cosine",
172
- )
173
-
174
- trainer = Trainer(
175
- model=model,
176
- args=training_args,
177
- train_dataset=dataset,
178
- data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
179
- )
180
-
181
- # ── Train ──────────────────────────────────────────────────────────────────
182
- print("\n[*] Training started …")
183
- trainer.train()
184
- print("[βœ“] Training complete")
185
-
186
- # ── Save adapter locally ───────────────────────────────────────────────────
187
- print(f"[*] Saving adapter to {OUTPUT_DIR} …")
188
- model.save_pretrained(OUTPUT_DIR)
189
- tokenizer.save_pretrained(OUTPUT_DIR)
190
-
191
- # ── Push adapter to HF Hub ─────────────────────────────────────────────────
192
- if HF_TOKEN:
193
- print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
194
- api = HfApi()
195
- # Create repo if needed
196
- try:
197
- api.create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
198
- except Exception as e:
199
- print(f"[!] Repo create warning: {e}")
200
-
201
- model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
202
- tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
203
- print(f"[βœ“] Adapter pushed β†’ https://huggingface.co/{ADAPTER_REPO}")
204
- else:
205
- print("[!] Skipping Hub push β€” no HF_TOKEN")
206
-
207
- print("\nβœ… Done! Update app.py ADAPTER_PATH to point to the new adapter.")