eeshaAI committed on
Commit
ed28e0b
·
verified ·
1 Parent(s): 3a0f51d

Update train_on_hf_spaces.py: better error handling, show_error=True

Browse files
Files changed (1) hide show
  1. train_on_hf_spaces.py +148 -102
train_on_hf_spaces.py CHANGED
@@ -19,6 +19,7 @@ import sys
19
  import json
20
  import time
21
  import traceback
 
22
  from typing import Generator
23
 
24
  import torch
@@ -64,7 +65,6 @@ class VideoTokenDataset(Dataset):
64
  item = self.data[idx]
65
  prompt = item["text_prompt"]
66
  tokens = item["video_tokens"][: self.max_tokens]
67
- # Pad to fixed length
68
  while len(tokens) < self.max_tokens:
69
  tokens.append(0)
70
  return {
@@ -83,26 +83,36 @@ def train(data_path: str = "tokenized_dataset.json") -> Generator[str, None, Non
83
  yield "πŸš€ Starting training pipeline...\n"
84
 
85
  # ── 1. Load tokenizer & model ──────────────────────────────────────────
86
- yield "πŸ“¦ Loading OLMo 2 1B Instruct tokenizer & model (this may take a few minutes)...\n"
87
 
88
  try:
89
  from transformers import AutoModelForCausalLM, AutoTokenizer
90
- except ImportError:
91
- yield "❌ transformers not installed. Adding to requirements...\n"
92
  raise
93
 
94
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
95
- if tokenizer.pad_token is None:
96
- tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
 
 
97
 
98
  yield "πŸ“¦ Loading model in float32 on CPU (this takes ~2-3 min)...\n"
99
- model = AutoModelForCausalLM.from_pretrained(
100
- MODEL_NAME,
101
- trust_remote_code=True,
102
- torch_dtype=torch.float32,
103
- device_map="cpu",
104
- )
105
- yield f"βœ… Model loaded. Original vocab size: {len(tokenizer)}\n"
 
 
 
 
106
 
107
  # ── 2. Expand vocabulary ───────────────────────────────────────────────
108
  yield f"πŸ”€ Adding {CODEBOOK_SIZE} visual tokens + special tokens...\n"
@@ -116,90 +126,111 @@ def train(data_path: str = "tokenized_dataset.json") -> Generator[str, None, Non
116
 
117
  # ── 3. Apply LoRA ─────────────────────────────────────────────────────
118
  yield f"πŸ”§ Applying LoRA (r={LORA_R}, alpha={LORA_ALPHA})...\n"
119
- from peft import LoraConfig, get_peft_model, TaskType
120
-
121
- lora_config = LoraConfig(
122
- r=LORA_R,
123
- lora_alpha=LORA_ALPHA,
124
- target_modules=["q_proj", "v_proj"],
125
- lora_dropout=LORA_DROPOUT,
126
- bias="none",
127
- task_type=TaskType.CAUSAL_LM,
128
- )
129
- model = get_peft_model(model, lora_config)
130
- model.print_trainable_parameters()
131
- yield "βœ… LoRA applied.\n"
 
 
 
 
 
 
132
 
133
  # ── 4. Load dataset ───────────────────────────────────────────────────
134
  yield f"πŸ“Š Loading dataset from {data_path}...\n"
135
- dataset = VideoTokenDataset(data_path, max_tokens=256)
136
- dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
137
- total_steps = NUM_EPOCHS * len(dataloader)
138
- yield f"πŸ“Š {len(dataset)} samples Γ— {NUM_EPOCHS} epochs = {total_steps} steps\n"
 
 
 
 
 
139
 
140
  # ── 5. Train ──────────────────────────────────────────────────────────
141
  yield "πŸ”₯ Starting training loop...\n\n"
142
 
143
- optimizer = torch.optim.Adafactor(model.parameters(), lr=LEARNING_RATE, scale_parameter=True, relative_step=True, warmup_init=True)
144
  model.train()
145
 
146
  global_step = 0
147
  running_loss = 0.0
148
  start_time = time.time()
149
 
150
- for epoch in range(NUM_EPOCHS):
151
- epoch_loss = 0.0
152
- num_batches = 0
153
-
154
- for batch_idx, batch in enumerate(dataloader):
155
- prompt = batch["prompt"][0]
156
- video_tokens = batch["video_tokens"][0]
157
-
158
- # Format: <text_start> prompt <text_end> <video_start> tok1 tok2 ... <video_end>
159
- token_str = " ".join(f"<v_{t.item()}>" for t in video_tokens)
160
- text = f"Create a video of: {prompt} {VIDEO_START} {token_str} {VIDEO_END}"
161
-
162
- inputs = tokenizer(
163
- text,
164
- return_tensors="pt",
165
- truncation=True,
166
- max_length=MAX_SEQ_LEN,
167
- padding="max_length",
168
- )
169
-
170
- # Forward pass
171
- outputs = model(**inputs, labels=inputs["input_ids"])
172
- loss = outputs.loss / GRADIENT_ACCUMULATION
173
-
174
- # Backward pass
175
- loss.backward()
176
-
177
- if (batch_idx + 1) % GRADIENT_ACCUMULATION == 0 or (batch_idx + 1) == len(dataloader):
178
- torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
179
- optimizer.step()
180
- optimizer.zero_grad()
181
-
182
- global_step += 1
183
- batch_loss = loss.item() * GRADIENT_ACCUMULATION
184
- epoch_loss += batch_loss
185
- running_loss += batch_loss
186
- num_batches += 1
187
-
188
- elapsed = time.time() - start_time
189
- steps_per_sec = global_step / elapsed if elapsed > 0 else 0
190
-
191
- if batch_idx % LOG_EVERY == 0:
192
- msg = (
193
- f" Epoch {epoch + 1}/{NUM_EPOCHS} | "
194
- f"Step {batch_idx + 1}/{len(dataloader)} | "
195
- f"Loss: {batch_loss:.4f} | "
196
- f"Avg: {epoch_loss / num_batches:.4f} | "
197
- f"Speed: {steps_per_sec:.2f} steps/s\n"
198
  )
199
- yield msg
200
 
201
- avg_epoch_loss = epoch_loss / num_batches
202
- yield f"\nπŸ“ˆ Epoch {epoch + 1} complete. Avg Loss: {avg_epoch_loss:.4f}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  total_time = time.time() - start_time
205
  yield f"βœ… Training complete in {total_time:.0f}s ({total_time / 60:.1f} min)\n"
@@ -207,32 +238,47 @@ def train(data_path: str = "tokenized_dataset.json") -> Generator[str, None, Non
207
 
208
  # ── 6. Merge & push ──────────────────────────────────────────────────
209
  yield "πŸ”€ Merging LoRA weights back into base model...\n"
210
- model = model.merge_and_unload()
 
 
 
 
211
 
212
  yield "πŸ’Ύ Saving model locally...\n"
213
  save_dir = "./trained_model"
214
- model.save_pretrained(save_dir, safe_serialization=True)
215
- tokenizer.save_pretrained(save_dir)
 
 
 
 
 
 
216
 
217
  yield f"πŸš€ Pushing to {REPO_ID}...\n"
218
- from huggingface_hub import HfApi
219
-
220
- api = HfApi(token=HF_TOKEN)
221
-
222
- # Create model repo if it doesn't exist
223
  try:
224
- api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True, token=HF_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  except Exception as e:
226
- yield f"⚠️ Repo creation note: {e}\n"
227
-
228
- api.upload_folder(
229
- folder_path=save_dir,
230
- repo_id=REPO_ID,
231
- repo_type="model",
232
- commit_message=f"LoRA-trained OLMo 2 1B (r={LORA_R}, {NUM_EPOCHS} epochs)",
233
- )
234
- yield f"βœ… Model pushed to https://huggingface.co/{REPO_ID}\n"
235
- yield "\nπŸŽ‰ All done! The trained model is now available on HuggingFace.\n"
236
 
237
 
238
  # ---------------------------------------------------------------------------
 
19
  import json
20
  import time
21
  import traceback
22
+ import gc
23
  from typing import Generator
24
 
25
  import torch
 
65
  item = self.data[idx]
66
  prompt = item["text_prompt"]
67
  tokens = item["video_tokens"][: self.max_tokens]
 
68
  while len(tokens) < self.max_tokens:
69
  tokens.append(0)
70
  return {
 
83
  yield "πŸš€ Starting training pipeline...\n"
84
 
85
  # ── 1. Load tokenizer & model ──────────────────────────────────────────
86
+ yield "πŸ“¦ Loading OLMo 2 1B Instruct tokenizer...\n"
87
 
88
  try:
89
  from transformers import AutoModelForCausalLM, AutoTokenizer
90
+ except ImportError as e:
91
+ yield f"❌ transformers not installed: {e}\n"
92
  raise
93
 
94
+ try:
95
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
96
+ if tokenizer.pad_token is None:
97
+ tokenizer.pad_token = tokenizer.eos_token
98
+ yield f"βœ… Tokenizer loaded. Vocab size: {len(tokenizer)}\n"
99
+ except Exception as e:
100
+ yield f"❌ Failed to load tokenizer: {e}\n"
101
+ yield traceback.format_exc() + "\n"
102
+ raise
103
 
104
  yield "πŸ“¦ Loading model in float32 on CPU (this takes ~2-3 min)...\n"
105
+ try:
106
+ model = AutoModelForCausalLM.from_pretrained(
107
+ MODEL_NAME,
108
+ trust_remote_code=True,
109
+ torch_dtype=torch.float32,
110
+ )
111
+ yield f"βœ… Model loaded. Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M\n"
112
+ except Exception as e:
113
+ yield f"❌ Failed to load model: {e}\n"
114
+ yield traceback.format_exc() + "\n"
115
+ raise
116
 
117
  # ── 2. Expand vocabulary ───────────────────────────────────────────────
118
  yield f"πŸ”€ Adding {CODEBOOK_SIZE} visual tokens + special tokens...\n"
 
126
 
127
  # ── 3. Apply LoRA ─────────────────────────────────────────────────────
128
  yield f"πŸ”§ Applying LoRA (r={LORA_R}, alpha={LORA_ALPHA})...\n"
129
+ try:
130
+ from peft import LoraConfig, get_peft_model, TaskType
131
+
132
+ lora_config = LoraConfig(
133
+ r=LORA_R,
134
+ lora_alpha=LORA_ALPHA,
135
+ target_modules=["q_proj", "v_proj"],
136
+ lora_dropout=LORA_DROPOUT,
137
+ bias="none",
138
+ task_type=TaskType.CAUSAL_LM,
139
+ )
140
+ model = get_peft_model(model, lora_config)
141
+ trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
142
+ total = sum(p.numel() for p in model.parameters())
143
+ yield f"βœ… LoRA applied. Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)\n"
144
+ except Exception as e:
145
+ yield f"❌ Failed to apply LoRA: {e}\n"
146
+ yield traceback.format_exc() + "\n"
147
+ raise
148
 
149
  # ── 4. Load dataset ───────────────────────────────────────────────────
150
  yield f"πŸ“Š Loading dataset from {data_path}...\n"
151
+ try:
152
+ dataset = VideoTokenDataset(data_path, max_tokens=256)
153
+ dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
154
+ total_steps = NUM_EPOCHS * len(dataloader)
155
+ yield f"πŸ“Š {len(dataset)} samples Γ— {NUM_EPOCHS} epochs = {total_steps} steps\n"
156
+ except Exception as e:
157
+ yield f"❌ Failed to load dataset: {e}\n"
158
+ yield traceback.format_exc() + "\n"
159
+ raise
160
 
161
  # ── 5. Train ──────────────────────────────────────────────────────────
162
  yield "πŸ”₯ Starting training loop...\n\n"
163
 
164
+ optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
165
  model.train()
166
 
167
  global_step = 0
168
  running_loss = 0.0
169
  start_time = time.time()
170
 
171
+ try:
172
+ for epoch in range(NUM_EPOCHS):
173
+ epoch_loss = 0.0
174
+ num_batches = 0
175
+
176
+ for batch_idx, batch in enumerate(dataloader):
177
+ prompt = batch["prompt"][0]
178
+ video_tokens = batch["video_tokens"][0]
179
+
180
+ # Format training text
181
+ token_str = " ".join(f"<v_{t.item()}>" for t in video_tokens[:64]) # limit tokens for memory
182
+ text = f"Create a video of: {prompt} {VIDEO_START} {token_str} {VIDEO_END}"
183
+
184
+ inputs = tokenizer(
185
+ text,
186
+ return_tensors="pt",
187
+ truncation=True,
188
+ max_length=MAX_SEQ_LEN,
189
+ padding="max_length",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
 
191
 
192
+ # Forward pass
193
+ outputs = model(**inputs, labels=inputs["input_ids"])
194
+ loss = outputs.loss / GRADIENT_ACCUMULATION
195
+
196
+ # Backward pass
197
+ loss.backward()
198
+
199
+ if (batch_idx + 1) % GRADIENT_ACCUMULATION == 0 or (batch_idx + 1) == len(dataloader):
200
+ torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
201
+ optimizer.step()
202
+ optimizer.zero_grad()
203
+
204
+ global_step += 1
205
+ batch_loss = loss.item() * GRADIENT_ACCUMULATION
206
+ epoch_loss += batch_loss
207
+ running_loss += batch_loss
208
+ num_batches += 1
209
+
210
+ elapsed = time.time() - start_time
211
+ steps_per_sec = global_step / elapsed if elapsed > 0 else 0
212
+
213
+ if batch_idx % LOG_EVERY == 0:
214
+ msg = (
215
+ f" Epoch {epoch + 1}/{NUM_EPOCHS} | "
216
+ f"Step {batch_idx + 1}/{len(dataloader)} | "
217
+ f"Loss: {batch_loss:.4f} | "
218
+ f"Avg: {epoch_loss / num_batches:.4f} | "
219
+ f"Speed: {steps_per_sec:.2f} steps/s\n"
220
+ )
221
+ yield msg
222
+
223
+ # Free memory
224
+ del outputs, loss
225
+ gc.collect()
226
+
227
+ avg_epoch_loss = epoch_loss / num_batches
228
+ yield f"\nπŸ“ˆ Epoch {epoch + 1} complete. Avg Loss: {avg_epoch_loss:.4f}\n\n"
229
+
230
+ except Exception as e:
231
+ yield f"\n❌ Training error: {e}\n"
232
+ yield traceback.format_exc() + "\n"
233
+ raise
234
 
235
  total_time = time.time() - start_time
236
  yield f"βœ… Training complete in {total_time:.0f}s ({total_time / 60:.1f} min)\n"
 
238
 
239
  # ── 6. Merge & push ──────────────────────────────────────────────────
240
  yield "πŸ”€ Merging LoRA weights back into base model...\n"
241
+ try:
242
+ model = model.merge_and_unload()
243
+ yield "βœ… LoRA merged.\n"
244
+ except Exception as e:
245
+ yield f"⚠️ Merge note: {e}\n"
246
 
247
  yield "πŸ’Ύ Saving model locally...\n"
248
  save_dir = "./trained_model"
249
+ try:
250
+ model.save_pretrained(save_dir, safe_serialization=True)
251
+ tokenizer.save_pretrained(save_dir)
252
+ yield "βœ… Model saved locally.\n"
253
+ except Exception as e:
254
+ yield f"❌ Save failed: {e}\n"
255
+ yield traceback.format_exc() + "\n"
256
+ raise
257
 
258
  yield f"πŸš€ Pushing to {REPO_ID}...\n"
 
 
 
 
 
259
  try:
260
+ from huggingface_hub import HfApi
261
+
262
+ api = HfApi(token=HF_TOKEN)
263
+
264
+ # Create model repo if it doesn't exist
265
+ try:
266
+ api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
267
+ except Exception as e:
268
+ yield f"⚠️ Repo creation note: {e}\n"
269
+
270
+ api.upload_folder(
271
+ folder_path=save_dir,
272
+ repo_id=REPO_ID,
273
+ repo_type="model",
274
+ commit_message=f"LoRA-trained OLMo 2 1B (r={LORA_R}, {NUM_EPOCHS} epochs)",
275
+ )
276
+ yield f"βœ… Model pushed to https://huggingface.co/{REPO_ID}\n"
277
+ yield "\nπŸŽ‰ All done! The trained model is now available on HuggingFace.\n"
278
  except Exception as e:
279
+ yield f"❌ Push failed: {e}\n"
280
+ yield traceback.format_exc() + "\n"
281
+ raise
 
 
 
 
 
 
 
282
 
283
 
284
  # ---------------------------------------------------------------------------