rahul7star committed on
Commit
caafa78
·
verified ·
1 Parent(s): 1e1e4d4

Update app_gpu.py

Browse files
Files changed (1) hide show
  1. app_gpu.py +115 -104
app_gpu.py CHANGED
@@ -139,113 +139,124 @@ import spaces
139
  import torch
140
  from huggingface_hub import create_repo, upload_folder
141
 
142
- @spaces.GPU(duration=100)
143
- def train_lora_model(
144
- base_model,
145
- dataset,
146
- csvname,
147
- short_col,
148
- long_col,
149
- out,
150
- repo,
151
- batch_size,
152
- num_workers,
153
- r,
154
- a,
155
- ep,
156
- lr,
157
- max_records
158
- ):
159
- import torch
160
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
161
- from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
162
- from datasets import load_dataset
163
- import os
164
- from huggingface_hub import HfApi, upload_folder
165
-
166
- print(f"[INFO] Loading base model: {base_model}")
167
  device = "cuda" if torch.cuda.is_available() else "cpu"
168
- model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32)
169
- tokenizer = AutoTokenizer.from_pretrained(base_model)
170
- tokenizer.pad_token = tokenizer.eos_token
171
-
172
- # LoRA setup
173
- print("[INFO] Setting up LoRA configuration...")
174
- lora_config = LoraConfig(
175
- r=r,
176
- lora_alpha=a,
177
- target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
178
- lora_dropout=0.05,
179
- bias="none",
180
- task_type="CAUSAL_LM"
181
- )
182
- model = prepare_model_for_kbit_training(model)
183
- model = get_peft_model(model, lora_config)
184
- model.print_trainable_parameters()
185
-
186
- print(f"[INFO] Loading dataset from: {dataset}")
187
- ds = load_dataset(dataset)
188
- df = ds["train"].to_pandas()
189
- print(f"[DEBUG] Loaded dataset: {dataset}, columns: {df.columns.tolist()}")
190
- print("[DEBUG] Sample rows:\n", df.head(3))
191
-
192
- df = df[[short_col, long_col]].dropna().head(max_records)
193
- train_data = list(zip(df[short_col], df[long_col]))
194
-
195
- print(f"[INFO] Tokenizing {len(train_data)} records...")
196
- def tokenize(examples):
197
- inputs = [f"Short: {s}\nLong: {l}" for s, l in examples]
198
- model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
199
- model_inputs["labels"] = model_inputs["input_ids"].copy()
200
- return model_inputs
201
-
202
- tokenized_data = [tokenize([f"{s}\n{l}"]) for s, l in train_data]
203
- print(f"[INFO] Tokenized {len(tokenized_data)} samples")
204
-
205
- # Trainer setup
206
- training_args = TrainingArguments(
207
- output_dir=out,
208
- num_train_epochs=ep,
209
- per_device_train_batch_size=batch_size,
210
- learning_rate=lr,
211
- logging_dir=os.path.join(out, "logs"),
212
- logging_steps=10,
213
- save_strategy="no",
214
- report_to="none",
215
- dataloader_num_workers=num_workers,
216
- max_steps=200, # Limit steps to 200 to avoid timeout
217
- )
218
-
219
- print("[INFO] Starting training loop (max 200 steps)...")
220
- dummy_dataset = [{"input_ids": torch.tensor(d["input_ids"]), "labels": torch.tensor(d["labels"])} for d in tokenized_data]
221
-
222
- trainer = Trainer(
223
- model=model,
224
- args=training_args,
225
- train_dataset=dummy_dataset
226
- )
227
-
228
- trainer.train()
229
- print("[✅] Training completed!")
230
-
231
- # Save adapter and tokenizer locally first
232
- model.save_pretrained(out)
233
- tokenizer.save_pretrained(out)
234
- print(f"[INFO] Model saved locally at: {out}")
235
-
236
- # ✅ Upload happens in CPU mode only (after freeing GPU memory)
237
- if repo:
238
- print("[INFO] Switching to CPU for model upload...")
239
- del model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  torch.cuda.empty_cache()
 
241
 
242
- api = HfApi()
243
- print(f"[INFO] Uploading model to Hugging Face repo: {repo}")
244
- upload_folder(folder_path=out, repo_id=repo, repo_type="model", commit_message="Upload trained LoRA adapter")
245
- print(f"[✅] Model uploaded successfully to {repo}")
246
-
247
- return f"✅ Training done and uploaded to {repo if repo else 'local directory only.'}"
248
 
 
 
 
 
 
 
 
 
 
 
249
 
250
 
251
  def upload_adapter(local, repo_id):
@@ -282,7 +293,7 @@ def run_ui():
282
  logs = gr.Textbox(label="Logs (streaming)", lines=25)
283
 
284
  def launch(bm, ds, csv, sc, lc, out_dir, batch, num_w, r_, a_, ep_, lr_, max_rec, repo_):
285
- gen = train_lora_model(
286
  bm, ds, csv, [sc, lc], out_dir,
287
  epochs=int(ep_), lr=float(lr_), r=int(r_), alpha=int(a_),
288
  batch_size=int(batch), num_workers=int(num_w),
 
139
  import torch
140
  from huggingface_hub import create_repo, upload_folder
141
 
142
+
143
+ @spaces.GPU(duration=110)
144
+ def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
145
+ epochs=1, lr=1e-4, r=8, alpha=16, batch_size=1, num_workers=0,
146
+ max_train_records=None, repo_id=None):
147
+ """LoRA training loop with GPU for compute, CPU for upload, capped at 150 steps."""
148
+
149
+ # --- Device setup ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  device = "cuda" if torch.cuda.is_available() else "cpu"
151
+ gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
152
+ print(f"[INFO] 🚀 Using device: {device.upper()} ({gpu_name})")
153
+
154
+ # Adjust precision / batch based on VRAM
155
+ if device == "cuda":
156
+ vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
157
+ print(f"[INFO] VRAM: {vram:.2f} GB")
158
+ dtype = torch.bfloat16 if "A100" in gpu_name or vram > 20 else torch.float16
159
+ if vram < 10:
160
+ batch_size = max(1, batch_size // 2)
161
+ print(f"[WARN] Low VRAM, using batch_size={batch_size}")
162
+ else:
163
+ dtype = torch.float32
164
+
165
+ # --- Model & tokenizer ---
166
+ accelerator = Accelerator()
167
+ pipe = load_pipeline_auto(base_model, dtype=dtype)
168
+ model_obj = pipe["model"]
169
+ tokenizer = pipe["tokenizer"]
170
+
171
+ model_obj.train()
172
+ target_modules = find_target_modules(model_obj)
173
+ lcfg = LoraConfig(r=r, lora_alpha=alpha, target_modules=target_modules, lora_dropout=0.0)
174
+ lora_module = get_peft_model(model_obj, lcfg)
175
+
176
+ # --- Dataset ---
177
+ dataset = MediaTextDataset(dataset_src, csv_name, text_columns=text_cols, max_records=max_train_records)
178
+ loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
179
+ optimizer = torch.optim.AdamW(lora_module.parameters(), lr=lr)
180
+ lora_module, optimizer, loader = accelerator.prepare(lora_module, optimizer, loader)
181
+
182
+ # --- Limit steps to 150 ---
183
+ max_steps = 150
184
+ total_steps = min(max_steps, max(1, epochs * len(loader)))
185
+ step_counter = 0
186
+ logs = []
187
+
188
+ yield f"[INFO] Starting LoRA training on {gpu_name} (max {max_steps} steps)...\n", 0.0
189
+
190
+ # --- Training Loop ---
191
+ for ep in range(epochs):
192
+ yield f"[DEBUG] Epoch {ep+1}/{epochs}\n", step_counter / total_steps
193
+ for i, batch in enumerate(loader):
194
+ if step_counter >= max_steps:
195
+ break
196
+
197
+ ex = unwrap_batch(batch, text_cols[0], text_cols[1])
198
+ texts = ex.get("text", {})
199
+ short_text = str(texts.get(text_cols[0], "") or "")
200
+ long_text = str(texts.get(text_cols[1], "") or "")
201
+
202
+ enc = tokenizer(
203
+ short_text,
204
+ text_pair=long_text,
205
+ return_tensors="pt",
206
+ padding="max_length",
207
+ truncation=True,
208
+ max_length=512,
209
+ )
210
+ enc = {k: v.to(accelerator.device) for k, v in enc.items()}
211
+ enc["labels"] = enc["input_ids"].clone()
212
+
213
+ outputs = lora_module(**enc)
214
+ loss = getattr(outputs, "loss", None)
215
+ if loss is None:
216
+ logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
217
+ loss = torch.nn.functional.cross_entropy(
218
+ logits.view(-1, logits.size(-1)),
219
+ enc["labels"].view(-1),
220
+ ignore_index=tokenizer.pad_token_id
221
+ )
222
+
223
+ optimizer.zero_grad()
224
+ accelerator.backward(loss)
225
+ optimizer.step()
226
+
227
+ logs.append(f"[DEBUG] Step {step_counter}, Loss: {loss.item():.6f}")
228
+ step_counter += 1
229
+ yield "\n".join(logs[-10:]), step_counter / total_steps
230
+
231
+ if step_counter >= max_steps:
232
+ break
233
+
234
+ # --- Save LoRA ---
235
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
236
+ lora_module.save_pretrained(output_dir)
237
+ yield f"[INFO] ✅ LoRA saved to {output_dir}\n", 0.95
238
+
239
+ # --- Free GPU before upload ---
240
+ if torch.cuda.is_available():
241
+ yield "[INFO] Releasing GPU memory before upload...\n", 0.96
242
+ del lora_module
243
  torch.cuda.empty_cache()
244
+ torch.cuda.synchronize()
245
 
246
+ # --- Upload to HF (CPU mode only) ---
247
+ repo_id = repo_id or os.environ.get("HF_UPLOAD_REPO")
248
+ token = os.environ.get("HF_TOKEN")
 
 
 
249
 
250
+ if repo_id and token:
251
+ yield f"[INFO] Uploading adapter to {repo_id} (CPU mode)...\n", 0.97
252
+ try:
253
+ create_repo(repo_id, repo_type="model", exist_ok=True, token=token)
254
+ upload_folder(folder_path=output_dir, repo_id=repo_id, repo_type="model", token=token)
255
+ yield f"[INFO] ✅ Uploaded successfully: https://huggingface.co/{repo_id}\n", 1.0
256
+ except Exception as e:
257
+ yield f"[ERROR] Upload failed: {e}\n", 1.0
258
+ else:
259
+ yield f"[INFO] Skipping upload — repo_id or token not provided.\n", 1.0
260
 
261
 
262
  def upload_adapter(local, repo_id):
 
293
  logs = gr.Textbox(label="Logs (streaming)", lines=25)
294
 
295
  def launch(bm, ds, csv, sc, lc, out_dir, batch, num_w, r_, a_, ep_, lr_, max_rec, repo_):
296
+ gen = train_lora_stream(
297
  bm, ds, csv, [sc, lc], out_dir,
298
  epochs=int(ep_), lr=float(lr_), r=int(r_), alpha=int(a_),
299
  batch_size=int(batch), num_workers=int(num_w),