Spaces:
Running
Running
Update app_gpu.py
Browse files- app_gpu.py +115 -104
app_gpu.py
CHANGED
|
@@ -139,113 +139,124 @@ import spaces
|
|
| 139 |
import torch
|
| 140 |
from huggingface_hub import create_repo, upload_folder
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
repo,
|
| 151 |
-
batch_size,
|
| 152 |
-
num_workers,
|
| 153 |
-
r,
|
| 154 |
-
a,
|
| 155 |
-
ep,
|
| 156 |
-
lr,
|
| 157 |
-
max_records
|
| 158 |
-
):
|
| 159 |
-
import torch
|
| 160 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
|
| 161 |
-
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 162 |
-
from datasets import load_dataset
|
| 163 |
-
import os
|
| 164 |
-
from huggingface_hub import HfApi, upload_folder
|
| 165 |
-
|
| 166 |
-
print(f"[INFO] Loading base model: {base_model}")
|
| 167 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
torch.cuda.empty_cache()
|
|
|
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
print(f"[✅] Model uploaded successfully to {repo}")
|
| 246 |
-
|
| 247 |
-
return f"✅ Training done and uploaded to {repo if repo else 'local directory only.'}"
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
|
| 251 |
def upload_adapter(local, repo_id):
|
|
@@ -282,7 +293,7 @@ def run_ui():
|
|
| 282 |
logs = gr.Textbox(label="Logs (streaming)", lines=25)
|
| 283 |
|
| 284 |
def launch(bm, ds, csv, sc, lc, out_dir, batch, num_w, r_, a_, ep_, lr_, max_rec, repo_):
|
| 285 |
-
gen =
|
| 286 |
bm, ds, csv, [sc, lc], out_dir,
|
| 287 |
epochs=int(ep_), lr=float(lr_), r=int(r_), alpha=int(a_),
|
| 288 |
batch_size=int(batch), num_workers=int(num_w),
|
|
|
|
| 139 |
import torch
|
| 140 |
from huggingface_hub import create_repo, upload_folder
|
| 141 |
|
| 142 |
+
|
| 143 |
+
@spaces.GPU(duration=110)
def train_lora_stream(base_model, dataset_src, csv_name, text_cols, output_dir,
                      epochs=1, lr=1e-4, r=8, alpha=16, batch_size=1, num_workers=0,
                      max_train_records=None, repo_id=None, max_steps=150):
    """Train a LoRA adapter and stream progress, then optionally upload it.

    This is a generator: every item it yields is a ``(log_text, progress)``
    tuple, where ``progress`` is a float in ``[0, 1]``.

    Args:
        base_model: Identifier handed to ``load_pipeline_auto``.
        dataset_src: Dataset source handed to ``MediaTextDataset``.
        csv_name: CSV name handed to ``MediaTextDataset``.
        text_cols: Two-element sequence ``[short_col, long_col]`` naming the
            text columns that are tokenized as a sentence pair.
        output_dir: Directory the adapter is saved to (created if missing).
        epochs: Number of passes over the loader.
        lr: AdamW learning rate.
        r: LoRA rank.
        alpha: LoRA alpha.
        batch_size: DataLoader batch size; halved (min 1) when the GPU has
            less than 10 GB of VRAM.
        num_workers: DataLoader worker count.
        max_train_records: Record cap handed to ``MediaTextDataset``.
        repo_id: Hub repo to upload to; falls back to ``HF_UPLOAD_REPO``.
        max_steps: Hard cap on optimizer steps (default 150, minimum 1).

    Yields:
        ``(str, float)`` log text and progress fraction.
    """
    import gc  # local import: only needed for the post-training cleanup

    # Label value ignored by the loss (PyTorch's default ``ignore_index``).
    ignore_index = -100

    # --- Device setup ---
    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
    print(f"[INFO] 🚀 Using device: {device.upper()} ({gpu_name})")

    # Adjust precision / batch based on VRAM
    if device == "cuda":
        vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        print(f"[INFO] VRAM: {vram:.2f} GB")
        # VRAM size alone does not imply bf16 support, so also ask the runtime.
        wants_bf16 = "A100" in gpu_name or vram > 20
        if wants_bf16 and torch.cuda.is_bf16_supported():
            dtype = torch.bfloat16
        else:
            dtype = torch.float16
        if vram < 10:
            batch_size = max(1, batch_size // 2)
            print(f"[WARN] Low VRAM, using batch_size={batch_size}")
    else:
        dtype = torch.float32

    # --- Model & tokenizer ---
    accelerator = Accelerator()
    pipe = load_pipeline_auto(base_model, dtype=dtype)
    model_obj = pipe["model"]
    tokenizer = pipe["tokenizer"]

    # padding="max_length" below needs a pad token; fall back to EOS if unset.
    if (getattr(tokenizer, "pad_token_id", None) is None
            and getattr(tokenizer, "eos_token", None) is not None):
        tokenizer.pad_token = tokenizer.eos_token

    model_obj.train()
    target_modules = find_target_modules(model_obj)
    lcfg = LoraConfig(r=r, lora_alpha=alpha, target_modules=target_modules, lora_dropout=0.0)
    lora_module = get_peft_model(model_obj, lcfg)

    # --- Dataset ---
    dataset = MediaTextDataset(dataset_src, csv_name, text_columns=text_cols,
                               max_records=max_train_records)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # Only the adapter weights require gradients; skip the frozen base weights.
    trainable_params = [p for p in lora_module.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr)
    lora_module, optimizer, loader = accelerator.prepare(lora_module, optimizer, loader)

    # --- Limit steps ---
    max_steps = max(1, int(max_steps))  # keeps total_steps >= 1 (no div-by-zero)
    total_steps = min(max_steps, max(1, epochs * len(loader)))
    step_counter = 0
    logs = []
    short_col, long_col = text_cols[0], text_cols[1]

    yield f"[INFO] Starting LoRA training on {gpu_name} (max {max_steps} steps)...\n", 0.0

    # --- Training Loop ---
    for ep in range(epochs):
        yield f"[DEBUG] Epoch {ep+1}/{epochs}\n", step_counter / total_steps
        for batch in loader:
            if step_counter >= max_steps:
                break

            ex = unwrap_batch(batch, short_col, long_col)
            texts = ex.get("text", {})
            short_text = str(texts.get(short_col, "") or "")
            long_text = str(texts.get(long_col, "") or "")

            enc = tokenizer(
                short_text,
                text_pair=long_text,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            enc = {k: v.to(accelerator.device) for k, v in enc.items()}

            # Every sample is padded to 512 tokens; exclude the padding from
            # the loss so it does not drown out the real tokens.
            labels = enc["input_ids"].clone()
            attention_mask = enc.get("attention_mask")
            if attention_mask is not None:
                labels[attention_mask == 0] = ignore_index
            enc["labels"] = labels

            outputs = lora_module(**enc)
            loss = getattr(outputs, "loss", None)
            if loss is None:
                # NOTE(review): this fallback compares logits and labels
                # position-by-position (no next-token shift) — confirm that is
                # what the wrapped model expects.
                logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
                loss = torch.nn.functional.cross_entropy(
                    logits.view(-1, logits.size(-1)),
                    enc["labels"].view(-1),
                    ignore_index=ignore_index,
                )

            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()

            logs.append(f"[DEBUG] Step {step_counter}, Loss: {loss.item():.6f}")
            step_counter += 1
            yield "\n".join(logs[-10:]), step_counter / total_steps

        if step_counter >= max_steps:
            break

    # --- Save LoRA ---
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Unwrap first so save_pretrained is called on the PEFT model itself even
    # if accelerator.prepare() wrapped it.
    accelerator.unwrap_model(lora_module).save_pretrained(output_dir)
    yield f"[INFO] ✅ LoRA saved to {output_dir}\n", 0.95

    # --- Free GPU before upload ---
    if torch.cuda.is_available():
        yield "[INFO] Releasing GPU memory before upload...\n", 0.96
        # Drop every local reference to GPU tensors; clearing only the wrapped
        # module would leave the base model, pipeline and optimizer state alive.
        lora_module = model_obj = pipe = optimizer = loader = None
        trainable_params = batch = enc = labels = outputs = loss = None
        accelerator.free_memory()
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

    # --- Upload to HF (CPU mode only) ---
    repo_id = repo_id or os.environ.get("HF_UPLOAD_REPO")
    token = os.environ.get("HF_TOKEN")

    if repo_id and token:
        yield f"[INFO] Uploading adapter to {repo_id} (CPU mode)...\n", 0.97
        try:
            create_repo(repo_id, repo_type="model", exist_ok=True, token=token)
            upload_folder(folder_path=output_dir, repo_id=repo_id, repo_type="model", token=token)
            yield f"[INFO] ✅ Uploaded successfully: https://huggingface.co/{repo_id}\n", 1.0
        except Exception as e:
            # Best-effort upload: the adapter is already saved locally, so
            # report the failure to the UI instead of raising.
            yield f"[ERROR] Upload failed: {e}\n", 1.0
    else:
        yield f"[INFO] Skipping upload — repo_id or token not provided.\n", 1.0
|
| 260 |
|
| 261 |
|
| 262 |
def upload_adapter(local, repo_id):
|
|
|
|
| 293 |
logs = gr.Textbox(label="Logs (streaming)", lines=25)
|
| 294 |
|
| 295 |
def launch(bm, ds, csv, sc, lc, out_dir, batch, num_w, r_, a_, ep_, lr_, max_rec, repo_):
|
| 296 |
+
gen = train_lora_stream(
|
| 297 |
bm, ds, csv, [sc, lc], out_dir,
|
| 298 |
epochs=int(ep_), lr=float(lr_), r=int(r_), alpha=int(a_),
|
| 299 |
batch_size=int(batch), num_workers=int(num_w),
|