henribonamy committed on
Commit
59c4e64
·
verified ·
1 Parent(s): 3228c8a

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +16 -3
app.py CHANGED
@@ -60,8 +60,8 @@ def upload_to_hub(local_path: str, repo_path: str) -> None:
60
  log(f"Upload failed for {repo_path}: {e}")
61
 
62
 
63
- def run_training() -> None:
64
- """Run the full training pipeline."""
65
  global training_status
66
  training_status = "LOADING DATA"
67
 
@@ -72,7 +72,9 @@ def run_training() -> None:
72
  log(f"Device: {device}")
73
  if device.type == "cuda":
74
  log(f"GPU: {torch.cuda.get_device_name(0)}")
75
- log(f"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
 
 
76
 
77
  log("=" * 60)
78
  log("PHASE 1: Loading and preparing datasets")
@@ -185,6 +187,17 @@ def run_training() -> None:
185
  log("=" * 60)
186
 
187
 
 
 
 
 
 
 
 
 
 
 
 
188
  def get_logs() -> str:
189
  """Return current training logs."""
190
  header = f"Status: {training_status}\n{'=' * 60}\n"
 
60
  log(f"Upload failed for {repo_path}: {e}")
61
 
62
 
63
+ def _run_training_inner() -> None:
64
+ """Run the full training pipeline (inner function)."""
65
  global training_status
66
  training_status = "LOADING DATA"
67
 
 
72
  log(f"Device: {device}")
73
  if device.type == "cuda":
74
  log(f"GPU: {torch.cuda.get_device_name(0)}")
75
+ props = torch.cuda.get_device_properties(0)
76
+ vram = getattr(props, "total_memory", getattr(props, "total_mem", 0))
77
+ log(f"VRAM: {vram / 1e9:.1f} GB")
78
 
79
  log("=" * 60)
80
  log("PHASE 1: Loading and preparing datasets")
 
187
  log("=" * 60)
188
 
189
 
190
+ def run_training() -> None:
191
+ """Wrapper that catches all exceptions from the training pipeline."""
192
+ global training_status
193
+ try:
194
+ _run_training_inner()
195
+ except Exception as e:
196
+ training_status = f"CRASHED: {e}"
197
+ log(f"FATAL ERROR: {e}")
198
+ log(traceback.format_exc())
199
+
200
+
201
  def get_logs() -> str:
202
  """Return current training logs."""
203
  header = f"Status: {training_status}\n{'=' * 60}\n"