NorthernTribe-Research committed on
Commit
a68d3ef
·
verified ·
1 Parent(s): ce9c78b

Enable CPU fallback training path and preserve live training-loss graph telemetry.

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +36 -3
  3. scripts/train_sota.py +19 -7
README.md CHANGED
@@ -36,7 +36,7 @@ Credentials and publish permissions are handled by deployment runtime settings.
36
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
37
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
38
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
39
- - `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline, and gate/push state.
40
  - `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
41
  - `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
42
  - `Force Dataset Redownload`: bypasses cached parquet files.
@@ -52,5 +52,5 @@ Credentials and publish permissions are handled by deployment runtime settings.
52
 
53
  ## Notes
54
 
55
- - Full training requires GPU hardware.
56
  - App handles Gradio copy-button compatibility across versions automatically.
 
36
  - `Run Evaluation After Training`: toggles post-train eval in runtime config.
37
  - `Enforce Quality Gate`: enables/disables promotion gate checks.
38
  - `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
39
+ - `Live Tactical Telemetry`: real-time stage progression, runtime posture, and training-loss graph (sparkline) with gate/push state.
40
  - `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
41
  - `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
42
  - `Force Dataset Redownload`: bypasses cached parquet files.
 
52
 
53
  ## Notes
54
 
55
+ - Full training runs on GPU when available and automatically falls back to CPU mode when CUDA is unavailable.
56
  - App handles Gradio copy-button compatibility across versions automatically.
app.py CHANGED
@@ -456,6 +456,30 @@ def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
456
  return losses
457
 
458
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  def _build_loss_sparkline(losses: List[float]) -> str:
460
  if not losses:
461
  return "<div class='ops-v-small'>No live loss points yet.</div>"
@@ -526,7 +550,7 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
526
 
527
  run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
528
  status_value = html.escape(status_text or "Idle")
529
- runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU / PRECHECK"
530
  runtime_mode = html.escape(runtime_mode)
531
  device_count = _safe_int(runtime.get("cuda_device_count"), 0)
532
 
@@ -548,6 +572,10 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
548
  stage_hint = html.escape(stage_hint)
549
 
550
  losses = _extract_loss_values(log_text)
 
 
 
 
551
  sparkline_html = _build_loss_sparkline(losses)
552
 
553
  pass_k = _fmt_pct(evaluation.get("pass_at_k"))
@@ -1039,9 +1067,14 @@ def run_pipeline_core(
1039
  yield "\n".join(log_lines), "Validating environment", summary_text(summary)
1040
 
1041
  if not preflight_only and not torch.cuda.is_available():
1042
- raise RuntimeError(
1043
- "GPU is not available. Switch Space hardware to a GPU (for example, T4) before full training."
 
 
1044
  )
 
 
 
1045
 
1046
  effective_push_to_hub = bool(push_to_hub)
1047
  if effective_push_to_hub and not token:
 
456
  return losses
457
 
458
 
459
def _extract_summary_loss_values(summary: Dict[str, Any], limit: int = 24) -> List[float]:
    """Collect per-stage final train-loss points from a mission summary.

    Reads ``summary["training_summary"]["stages_ran"]`` and pulls each
    stage's ``train_metrics["train_loss"]``, keeping only values that
    parse as non-negative floats. Malformed entries (non-dict stages,
    missing/invalid metrics) are skipped silently.

    Args:
        summary: Structured mission-summary mapping; may be partially
            populated while a run is still in progress.
        limit: Maximum number of loss points to return; only the most
            recent *limit* values are kept.

    Returns:
        Up to *limit* non-negative loss values, in stage order.
    """
    collected: List[float] = []
    stages = _as_dict(summary.get("training_summary")).get("stages_ran")
    if not isinstance(stages, list):
        return collected
    for entry in stages:
        if not isinstance(entry, dict):
            continue
        metrics = entry.get("train_metrics")
        if not isinstance(metrics, dict):
            continue
        try:
            # float() raises TypeError on None / ValueError on bad strings;
            # either way the stage simply contributes no point.
            candidate = float(metrics.get("train_loss"))
        except (TypeError, ValueError):
            continue
        if candidate >= 0.0:
            collected.append(candidate)
    # Keep only the newest points so the sparkline stays bounded.
    if len(collected) > limit:
        collected = collected[-limit:]
    return collected
481
+
482
+
483
  def _build_loss_sparkline(losses: List[float]) -> str:
484
  if not losses:
485
  return "<div class='ops-v-small'>No live loss points yet.</div>"
 
550
 
551
  run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
552
  status_value = html.escape(status_text or "Idle")
553
+ runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU FALLBACK"
554
  runtime_mode = html.escape(runtime_mode)
555
  device_count = _safe_int(runtime.get("cuda_device_count"), 0)
556
 
 
572
  stage_hint = html.escape(stage_hint)
573
 
574
  losses = _extract_loss_values(log_text)
575
+ if len(losses) < 2:
576
+ summary_losses = _extract_summary_loss_values(safe_summary)
577
+ if summary_losses:
578
+ losses = summary_losses
579
  sparkline_html = _build_loss_sparkline(losses)
580
 
581
  pass_k = _fmt_pct(evaluation.get("pass_at_k"))
 
1067
  yield "\n".join(log_lines), "Validating environment", summary_text(summary)
1068
 
1069
  if not preflight_only and not torch.cuda.is_available():
1070
+ summary["compute_mode"] = "cpu_fallback"
1071
+ append_log(
1072
+ log_lines,
1073
+ "GPU is unavailable. Continuing with CPU fallback mode; training will be slower.",
1074
  )
1075
+ yield "\n".join(log_lines), "CPU fallback active", summary_text(summary)
1076
+ elif torch.cuda.is_available():
1077
+ summary["compute_mode"] = "gpu"
1078
 
1079
  effective_push_to_hub = bool(push_to_hub)
1080
  if effective_push_to_hub and not token:
scripts/train_sota.py CHANGED
@@ -432,8 +432,12 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
432
  if not base_model:
433
  raise ValueError("model.base_model is required.")
434
 
435
- use_bf16 = bool(model_cfg.get("use_bf16", True))
436
- dtype = torch.bfloat16 if use_bf16 else torch.float16
 
 
 
 
437
 
438
  tokenizer = build_tokenizer(model_cfg)
439
 
@@ -445,10 +449,11 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
445
  if attn_impl:
446
  model_kwargs["attn_implementation"] = attn_impl
447
 
448
- load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
 
 
 
449
  if load_in_4bit:
450
- if not torch.cuda.is_available():
451
- raise RuntimeError("4-bit loading requested but CUDA is not available.")
452
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
453
  load_in_4bit=True,
454
  bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
@@ -565,6 +570,9 @@ def build_training_args(
565
  has_eval_split: bool,
566
  ) -> TrainingArguments:
567
  output_dir.mkdir(parents=True, exist_ok=True)
 
 
 
568
  return TrainingArguments(
569
  output_dir=str(output_dir),
570
  num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
@@ -582,8 +590,8 @@ def build_training_args(
582
  save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
583
  dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
584
  seed=as_int(training_cfg.get("seed"), 17),
585
- bf16=use_bf16,
586
- fp16=not use_bf16,
587
  remove_unused_columns=False,
588
  report_to="none",
589
  evaluation_strategy="steps" if has_eval_split else "no",
@@ -860,6 +868,10 @@ def main() -> None:
860
  model = None
861
  else:
862
  model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
 
 
 
 
863
 
864
  data_cfg = cfg["data"]
865
  stage_reports: List[Dict[str, Any]] = []
 
432
  if not base_model:
433
  raise ValueError("model.base_model is required.")
434
 
435
+ use_cuda = torch.cuda.is_available()
436
+ requested_bf16 = bool(model_cfg.get("use_bf16", True))
437
+ if use_cuda:
438
+ dtype = torch.bfloat16 if requested_bf16 else torch.float16
439
+ else:
440
+ dtype = torch.float32
441
 
442
  tokenizer = build_tokenizer(model_cfg)
443
 
 
449
  if attn_impl:
450
  model_kwargs["attn_implementation"] = attn_impl
451
 
452
+ requested_load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
453
+ load_in_4bit = requested_load_in_4bit and use_cuda
454
+ if requested_load_in_4bit and not load_in_4bit:
455
+ print("CUDA unavailable. Disabling 4-bit loading and using full-precision CPU fallback.")
456
  if load_in_4bit:
 
 
457
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
458
  load_in_4bit=True,
459
  bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
 
570
  has_eval_split: bool,
571
  ) -> TrainingArguments:
572
  output_dir.mkdir(parents=True, exist_ok=True)
573
+ use_cuda = torch.cuda.is_available()
574
+ bf16_runtime = bool(use_cuda and use_bf16)
575
+ fp16_runtime = bool(use_cuda and not bf16_runtime)
576
  return TrainingArguments(
577
  output_dir=str(output_dir),
578
  num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
 
590
  save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
591
  dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
592
  seed=as_int(training_cfg.get("seed"), 17),
593
+ bf16=bf16_runtime,
594
+ fp16=fp16_runtime,
595
  remove_unused_columns=False,
596
  report_to="none",
597
  evaluation_strategy="steps" if has_eval_split else "no",
 
868
  model = None
869
  else:
870
  model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
871
+ if torch.cuda.is_available():
872
+ print("Compute mode: GPU")
873
+ else:
874
+ print("Compute mode: CPU fallback (no CUDA detected)")
875
 
876
  data_cfg = cfg["data"]
877
  stage_reports: List[Dict[str, Any]] = []