NorthernTribe-Research committed on
Commit
a68d3ef
·
verified ·
1 Parent(s): ce9c78b

Enable CPU fallback training path and preserve live training-loss graph telemetry.

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +36 -3
  3. scripts/train_sota.py +19 -7
README.md CHANGED
@@ -36,7 +36,7 @@ Credentials and publish permissions are handled by deployment runtime settings.
36
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
37
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
38
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
39
- - `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
40
  - `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
41
  - `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
42
  - `Force Dataset Redownload`: bypasses cached parquet files.
@@ -52,5 +52,5 @@ Credentials and publish permissions are handled by deployment runtime settings.
52
 
53
  ## Notes
54
 
55
- - Full training requires GPU hardware.
56
  - App handles Gradio copy-button compatibility across versions automatically.
 
36
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
37
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
38
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
39
+ - `Live Tactical Telemetry`: real-time stage progression, runtime posture, and training-loss graph (sparkline) with gate/push state.
40
  - `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
41
  - `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
42
  - `Force Dataset Redownload`: bypasses cached parquet files.
 
52
 
53
  ## Notes
54
 
55
+ - Full training runs on GPU when available and automatically falls back to CPU mode when CUDA is unavailable.
56
  - App handles Gradio copy-button compatibility across versions automatically.
app.py CHANGED
@@ -456,6 +456,30 @@ def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
456
  return losses
457
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  def _build_loss_sparkline(losses: List[float]) -> str:
460
  if not losses:
461
  return "<div class='ops-v-small'>No live loss points yet.</div>"
@@ -526,7 +550,7 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
526
 
527
  run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
528
  status_value = html.escape(status_text or "Idle")
529
- runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU / PRECHECK"
530
  runtime_mode = html.escape(runtime_mode)
531
  device_count = _safe_int(runtime.get("cuda_device_count"), 0)
532
 
@@ -548,6 +572,10 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
548
  stage_hint = html.escape(stage_hint)
549
 
550
  losses = _extract_loss_values(log_text)
 
 
 
 
551
  sparkline_html = _build_loss_sparkline(losses)
552
 
553
  pass_k = _fmt_pct(evaluation.get("pass_at_k"))
@@ -1039,9 +1067,14 @@ def run_pipeline_core(
1039
  yield "\n".join(log_lines), "Validating environment", summary_text(summary)
1040
 
1041
  if not preflight_only and not torch.cuda.is_available():
1042
- raise RuntimeError(
1043
- "GPU is not available. Switch Space hardware to a GPU (for example, T4) before full training."
 
 
1044
  )
 
 
 
1045
 
1046
  effective_push_to_hub = bool(push_to_hub)
1047
  if effective_push_to_hub and not token:
 
456
  return losses
457
 
458
 
459
def _extract_summary_loss_values(summary: Dict[str, Any], limit: int = 24) -> List[float]:
    """Collect per-stage final train-loss points from a mission summary.

    Reads ``summary["training_summary"]["stages_ran"]`` and pulls each
    stage's ``train_metrics["train_loss"]``, keeping only values that
    parse as non-negative floats. Malformed entries (non-dict stages,
    missing/invalid metrics) are skipped silently.

    Args:
        summary: Structured mission-summary mapping; may be partially
            populated while a run is still in progress.
        limit: Maximum number of loss points to return; only the most
            recent *limit* values are kept.

    Returns:
        Up to *limit* non-negative loss values, in stage order.
    """
    collected: List[float] = []
    stages = _as_dict(summary.get("training_summary")).get("stages_ran")
    if not isinstance(stages, list):
        return collected
    for entry in stages:
        if not isinstance(entry, dict):
            continue
        metrics = entry.get("train_metrics")
        if not isinstance(metrics, dict):
            continue
        try:
            # float() raises TypeError on None / ValueError on bad strings;
            # either way the stage simply contributes no point.
            candidate = float(metrics.get("train_loss"))
        except (TypeError, ValueError):
            continue
        if candidate >= 0.0:
            collected.append(candidate)
    # Keep only the newest points so the sparkline stays bounded.
    if len(collected) > limit:
        collected = collected[-limit:]
    return collected
481
+
482
+
483
  def _build_loss_sparkline(losses: List[float]) -> str:
484
  if not losses:
485
  return "<div class='ops-v-small'>No live loss points yet.</div>"
 
550
 
551
  run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
552
  status_value = html.escape(status_text or "Idle")
553
+ runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU FALLBACK"
554
  runtime_mode = html.escape(runtime_mode)
555
  device_count = _safe_int(runtime.get("cuda_device_count"), 0)
556
 
 
572
  stage_hint = html.escape(stage_hint)
573
 
574
  losses = _extract_loss_values(log_text)
575
+ if len(losses) < 2:
576
+ summary_losses = _extract_summary_loss_values(safe_summary)
577
+ if summary_losses:
578
+ losses = summary_losses
579
  sparkline_html = _build_loss_sparkline(losses)
580
 
581
  pass_k = _fmt_pct(evaluation.get("pass_at_k"))
 
1067
  yield "\n".join(log_lines), "Validating environment", summary_text(summary)
1068
 
1069
  if not preflight_only and not torch.cuda.is_available():
1070
+ summary["compute_mode"] = "cpu_fallback"
1071
+ append_log(
1072
+ log_lines,
1073
+ "GPU is unavailable. Continuing with CPU fallback mode; training will be slower.",
1074
  )
1075
+ yield "\n".join(log_lines), "CPU fallback active", summary_text(summary)
1076
+ elif torch.cuda.is_available():
1077
+ summary["compute_mode"] = "gpu"
1078
 
1079
  effective_push_to_hub = bool(push_to_hub)
1080
  if effective_push_to_hub and not token:
scripts/train_sota.py CHANGED
@@ -432,8 +432,12 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
432
  if not base_model:
433
  raise ValueError("model.base_model is required.")
434
 
435
- use_bf16 = bool(model_cfg.get("use_bf16", True))
436
- dtype = torch.bfloat16 if use_bf16 else torch.float16
 
 
 
 
437
 
438
  tokenizer = build_tokenizer(model_cfg)
439
 
@@ -445,10 +449,11 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
445
  if attn_impl:
446
  model_kwargs["attn_implementation"] = attn_impl
447
 
448
- load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
 
 
 
449
  if load_in_4bit:
450
- if not torch.cuda.is_available():
451
- raise RuntimeError("4-bit loading requested but CUDA is not available.")
452
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
453
  load_in_4bit=True,
454
  bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
@@ -565,6 +570,9 @@ def build_training_args(
565
  has_eval_split: bool,
566
  ) -> TrainingArguments:
567
  output_dir.mkdir(parents=True, exist_ok=True)
 
 
 
568
  return TrainingArguments(
569
  output_dir=str(output_dir),
570
  num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
@@ -582,8 +590,8 @@ def build_training_args(
582
  save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
583
  dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
584
  seed=as_int(training_cfg.get("seed"), 17),
585
- bf16=use_bf16,
586
- fp16=not use_bf16,
587
  remove_unused_columns=False,
588
  report_to="none",
589
  evaluation_strategy="steps" if has_eval_split else "no",
@@ -860,6 +868,10 @@ def main() -> None:
860
  model = None
861
  else:
862
  model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
 
 
 
 
863
 
864
  data_cfg = cfg["data"]
865
  stage_reports: List[Dict[str, Any]] = []
 
432
  if not base_model:
433
  raise ValueError("model.base_model is required.")
434
 
435
+ use_cuda = torch.cuda.is_available()
436
+ requested_bf16 = bool(model_cfg.get("use_bf16", True))
437
+ if use_cuda:
438
+ dtype = torch.bfloat16 if requested_bf16 else torch.float16
439
+ else:
440
+ dtype = torch.float32
441
 
442
  tokenizer = build_tokenizer(model_cfg)
443
 
 
449
  if attn_impl:
450
  model_kwargs["attn_implementation"] = attn_impl
451
 
452
+ requested_load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
453
+ load_in_4bit = requested_load_in_4bit and use_cuda
454
+ if requested_load_in_4bit and not load_in_4bit:
455
+ print("CUDA unavailable. Disabling 4-bit loading and using full-precision CPU fallback.")
456
  if load_in_4bit:
 
 
457
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
458
  load_in_4bit=True,
459
  bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
 
570
  has_eval_split: bool,
571
  ) -> TrainingArguments:
572
  output_dir.mkdir(parents=True, exist_ok=True)
573
+ use_cuda = torch.cuda.is_available()
574
+ bf16_runtime = bool(use_cuda and use_bf16)
575
+ fp16_runtime = bool(use_cuda and not bf16_runtime)
576
  return TrainingArguments(
577
  output_dir=str(output_dir),
578
  num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
 
590
  save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
591
  dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
592
  seed=as_int(training_cfg.get("seed"), 17),
593
+ bf16=bf16_runtime,
594
+ fp16=fp16_runtime,
595
  remove_unused_columns=False,
596
  report_to="none",
597
  evaluation_strategy="steps" if has_eval_split else "no",
 
868
  model = None
869
  else:
870
  model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
871
+ if torch.cuda.is_available():
872
+ print("Compute mode: GPU")
873
+ else:
874
+ print("Compute mode: CPU fallback (no CUDA detected)")
875
 
876
  data_cfg = cfg["data"]
877
  stage_reports: List[Dict[str, Any]] = []