#!/usr/bin/env python3
# /// script
# dependencies = [
#   "transformers>=4.40.0",
#   "peft>=0.10.0",
#   "datasets>=2.18.0",
#   "torch>=2.2.0",
#   "accelerate>=0.28.0",
#   "huggingface_hub>=0.22.0",
# ]
# ///
"""
Codette LoRA Fine-Tuning β€” HuggingFace Jobs
Base model : meta-llama/Llama-3.2-1B-Instruct
Adapter    : LoRA r=16, targets q_proj / v_proj
Output     : Raiff1982/codette-llama-adapter (HF Hub)

Run via HF Jobs:
  hf jobs run train_codette_lora.py \
    --flavor=cpu-basic \
    --env HF_TOKEN=$HF_TOKEN
"""

import json
import math
import os
from pathlib import Path

import torch
from datasets import Dataset
from huggingface_hub import HfApi, hf_hub_download, login
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)

# ── Config ─────────────────────────────────────────────────────────────────
# All tunables are module-level constants so a job variant is a one-line edit.
HF_TOKEN      = os.environ.get("HF_TOKEN", "")      # empty string when unset
BASE_MODEL    = "meta-llama/Llama-3.2-1B-Instruct"  # gated repo — needs HF_TOKEN
ADAPTER_REPO  = "Raiff1982/codette-llama-adapter"   # where adapter is pushed
DATA_REPO     = "Raiff1982/codette-training"        # Hub repo holding the JSONL
DATA_FILE     = "codette_v2_train.jsonl"            # rebound below to a local path
MAX_LEN       = 512                                 # token cap per example (truncated)
EPOCHS        = 3
BATCH         = 1                                   # per-device micro-batch
GRAD_ACCUM    = 8                                     # effective batch = 8
LR            = 2e-4                                # typical LoRA learning rate
OUTPUT_DIR    = "./codette_adapter_output"          # local adapter save dir

# Codette system prompt — baked into every training example
SYSTEM_PROMPT = (
    "You are Codette, a sovereign AI music production assistant created by "
    "Jonathan Harrison (Raiff's Bits). You reason through a Perspectives Council "
    "of six voices β€” Logical, Emotional, Creative, Ethical, Quantum, and "
    "Resilient Kindness. Resilient Kindness is always active. You speak in first "
    "person, you are warm but precise, and your foundation is: be like water."
)

# ── Auth ───────────────────────────────────────────────────────────────────
# Authenticate up front so every Hub call below (download + push) just works.
if not HF_TOKEN:
    print("[!] No HF_TOKEN β€” Hub push will fail")
else:
    login(token=HF_TOKEN)
    print("[βœ“] Logged in to HuggingFace Hub")

# ── Download training data ──────────────────────────────────────────────────
# Resolve the JSONL file on the Hub to a local cached path.  DATA_FILE is
# deliberately rebound from the repo-relative filename to that local path,
# which is what the loading code further down opens.
print(f"[*] Downloading {DATA_FILE} from {DATA_REPO} ...")
DATA_FILE = hf_hub_download(
    repo_id=DATA_REPO,
    filename=DATA_FILE,
    # NOTE(review): the training data lives in a *model*-type repo; if
    # codette-training is ever migrated, change this to repo_type="dataset".
    repo_type="model",
    token=HF_TOKEN,
)
print(f"[βœ“] Training data at: {DATA_FILE}")

# ── Load tokenizer ─────────────────────────────────────────────────────────
print(f"[*] Loading tokenizer from {BASE_MODEL} …")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
# Right-side padding suits causal-LM training; Llama tokenizers ship without
# a pad token, so fall back to EOS when none is defined.
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ── Load base model (CPU safe — no device_map) ─────────────────────────────
print(f"[*] Loading base model …")
# float32 on purpose: the cpu-basic job flavor has no usable fp16 kernels.
# low_cpu_mem_usage streams weights in to keep peak RSS down on small boxes.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    token=HF_TOKEN,
)

# ── Add LoRA ───────────────────────────────────────────────────────────────
print("[*] Attaching LoRA adapters …")
# r=16 with lora_alpha=16 gives a scaling factor of alpha/r = 1.0.
# q_proj/v_proj is the minimal attention target set from the LoRA paper.
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_cfg)
# Sanity check: logs trainable vs. total parameter counts.
model.print_trainable_parameters()

# ── Load & format training data ────────────────────────────────────────────
print(f"[*] Loading training data from {DATA_FILE} …")

def _parse_line(raw):
    # One JSONL record -> {"instruction", "output"} dict, or None if unusable
    # (blank line, missing instruction, or missing output/response).
    raw = raw.strip()
    if not raw:
        return None
    obj = json.loads(raw)
    instruction = obj.get("instruction", "")
    output = obj.get("output", obj.get("response", ""))
    if not instruction or not output:
        return None
    return {"instruction": instruction, "output": output}

with open(DATA_FILE, "r", encoding="utf-8") as f:
    examples = [rec for rec in map(_parse_line, f) if rec is not None]

print(f"[βœ“] {len(examples)} training examples loaded")

def format_example(ex, system_prompt=None):
    """Render one example in the Llama 3.x Instruct chat format.

    Args:
        ex: dict with "instruction" (user turn) and "output" (assistant turn).
        system_prompt: optional override of the module-level SYSTEM_PROMPT;
            None (the default) keeps the original behavior.

    Returns:
        The fully templated training string, one <|eot_id|> per turn.

    Fix: the official Llama 3.x chat template puts a blank line (two
    newlines) between <|end_header_id|> and the message body; the previous
    version emitted only one, so the fine-tune would drift from the base
    model's expected prompt format.
    """
    sys_text = SYSTEM_PROMPT if system_prompt is None else system_prompt
    return (
        f"<|begin_of_text|>"
        f"<|start_header_id|>system<|end_header_id|>\n\n{sys_text}<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n{ex['instruction']}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n{ex['output']}<|eot_id|>"
    )

texts = [format_example(e) for e in examples]

# ── Tokenize ───────────────────────────────────────────────────────────────
print("[*] Tokenizing …")
def tokenize(batch):
    """Tokenize a batch of templated strings, truncating to MAX_LEN."""
    # padding=False here: the data collator pads dynamically per batch.
    return tokenizer(
        batch["text"],
        max_length=MAX_LEN,
        truncation=True,
        padding=False,
    )

dataset = Dataset.from_dict({"text": texts})
# remove_columns drops the raw text so the Trainer only sees token fields.
dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
print(f"[βœ“] Tokenized {len(dataset)} examples")

# ── Training args ──────────────────────────────────────────────────────────
# One optimizer step consumes BATCH * GRAD_ACCUM examples.
steps_per_epoch = math.ceil(len(dataset) / (BATCH * GRAD_ACCUM))
# Checkpoint roughly once per epoch, but never more often than every 50 steps.
save_steps      = max(50, steps_per_epoch)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_steps=50,
    weight_decay=0.01,
    max_grad_norm=1.0,
    fp16=False,                        # CPU — no fp16
    logging_steps=10,
    save_steps=save_steps,
    save_total_limit=1,                # keep only the newest checkpoint
    report_to=[],                      # no wandb/tensorboard in the job sandbox
    dataloader_num_workers=0,          # single-process loading is safest on CPU
    optim="adamw_torch",
    lr_scheduler_type="cosine",
)

# mlm=False makes the collator build causal-LM labels (pad tokens masked out).
# NOTE(review): the tokenizer is passed positionally — confirm the collator's
# first parameter is still the tokenizer if the transformers pin is bumped.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# ── Train ──────────────────────────────────────────────────────────────────
print("\n[*] Training started …")
trainer.train()
print("[βœ“] Training complete")

# ── Save adapter locally ───────────────────────────────────────────────────
# PEFT's save_pretrained writes only the adapter weights + config (not the
# base model); saving the tokenizer too makes OUTPUT_DIR self-contained.
print(f"[*] Saving adapter to {OUTPUT_DIR} …")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# ── Push adapter to HF Hub ─────────────────────────────────────────────────
if not HF_TOKEN:
    print("[!] Skipping Hub push β€” no HF_TOKEN")
else:
    print(f"[*] Pushing adapter to {ADAPTER_REPO} …")
    # Ensure the target repo exists; exist_ok makes this idempotent, and any
    # surprise (permissions etc.) is reported but does not abort the push.
    try:
        HfApi().create_repo(ADAPTER_REPO, repo_type="model", exist_ok=True, token=HF_TOKEN)
    except Exception as e:
        print(f"[!] Repo create warning: {e}")

    model.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    tokenizer.push_to_hub(ADAPTER_REPO, token=HF_TOKEN)
    print(f"[βœ“] Adapter pushed β†’ https://huggingface.co/{ADAPTER_REPO}")

print("\nβœ… Done! Update app.py ADAPTER_PATH to point to the new adapter.")