Enable CPU fallback training path and preserve live training-loss graph telemetry.
Browse files
- README.md +2 -2
- app.py +36 -3
- scripts/train_sota.py +19 -7
README.md
CHANGED
|
@@ -36,7 +36,7 @@ Credentials and publish permissions are handled by deployment runtime settings.
|
|
| 36 |
- `Run Evaluation After Training`: toggles post-train eval in runtime config.
|
| 37 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 38 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 39 |
-
- `Live Tactical Telemetry`: real-time stage progression, runtime posture, loss sparkline
|
| 40 |
- `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
|
| 41 |
- `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
|
| 42 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
|
@@ -52,5 +52,5 @@ Credentials and publish permissions are handled by deployment runtime settings.
|
|
| 52 |
|
| 53 |
## Notes
|
| 54 |
|
| 55 |
-
- Full training
|
| 56 |
- App handles Gradio copy-button compatibility across versions automatically.
|
|
|
|
| 36 |
- `Run Evaluation After Training`: toggles post-train eval in runtime config.
|
| 37 |
- `Enforce Quality Gate`: enables/disables promotion gate checks.
|
| 38 |
- `Gate Min pass@1`, `Gate Min pass@k`, `Gate Min Rows`: runtime gate thresholds.
|
| 39 |
+
- `Live Tactical Telemetry`: real-time stage progression, runtime posture, and training-loss graph (sparkline) with gate/push state.
|
| 40 |
- `Ops Console (Live Log + Mission JSON)`: unified panel for line-by-line runtime stream, heartbeats, and structured mission summary.
|
| 41 |
- `Validation Mode (No Training)`: validates pipeline with `--dry-run`.
|
| 42 |
- `Force Dataset Redownload`: bypasses cached parquet files.
|
|
|
|
| 52 |
|
| 53 |
## Notes
|
| 54 |
|
| 55 |
+
- Full training runs on GPU when available and automatically falls back to CPU mode when CUDA is unavailable.
|
| 56 |
- App handles Gradio copy-button compatibility across versions automatically.
|
app.py
CHANGED
|
@@ -456,6 +456,30 @@ def _extract_loss_values(log_text: str, limit: int = 48) -> List[float]:
|
|
| 456 |
return losses
|
| 457 |
|
| 458 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
def _build_loss_sparkline(losses: List[float]) -> str:
|
| 460 |
if not losses:
|
| 461 |
return "<div class='ops-v-small'>No live loss points yet.</div>"
|
|
@@ -526,7 +550,7 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 526 |
|
| 527 |
run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
|
| 528 |
status_value = html.escape(status_text or "Idle")
|
| 529 |
-
runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU
|
| 530 |
runtime_mode = html.escape(runtime_mode)
|
| 531 |
device_count = _safe_int(runtime.get("cuda_device_count"), 0)
|
| 532 |
|
|
@@ -548,6 +572,10 @@ def render_ops_visual(summary: Dict[str, Any], status_text: str, log_text: str)
|
|
| 548 |
stage_hint = html.escape(stage_hint)
|
| 549 |
|
| 550 |
losses = _extract_loss_values(log_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
sparkline_html = _build_loss_sparkline(losses)
|
| 552 |
|
| 553 |
pass_k = _fmt_pct(evaluation.get("pass_at_k"))
|
|
@@ -1039,9 +1067,14 @@ def run_pipeline_core(
|
|
| 1039 |
yield "\n".join(log_lines), "Validating environment", summary_text(summary)
|
| 1040 |
|
| 1041 |
if not preflight_only and not torch.cuda.is_available():
|
| 1042 |
-
|
| 1043 |
-
|
|
|
|
|
|
|
| 1044 |
)
|
|
|
|
|
|
|
|
|
|
| 1045 |
|
| 1046 |
effective_push_to_hub = bool(push_to_hub)
|
| 1047 |
if effective_push_to_hub and not token:
|
|
|
|
| 456 |
return losses
|
| 457 |
|
| 458 |
|
| 459 |
+
def _extract_summary_loss_values(summary: Dict[str, Any], limit: int = 24) -> List[float]:
    """Pull per-stage training-loss values out of a mission summary.

    Walks ``summary["training_summary"]["stages_ran"]`` and collects each
    stage's ``train_metrics["train_loss"]`` when it parses as a non-negative
    float. Malformed entries (non-dict stages/metrics, unparseable or
    negative values) are skipped silently.

    Args:
        summary: Mission summary mapping; missing/odd shapes yield [].
        limit: Maximum number of values returned (most recent kept).

    Returns:
        Up to ``limit`` loss values in stage order, newest last.
    """
    training_summary = _as_dict(summary.get("training_summary"))
    stages = training_summary.get("stages_ran")
    if not isinstance(stages, list):
        return []

    collected: List[float] = []
    for entry in stages:
        if not isinstance(entry, dict):
            continue
        metrics = entry.get("train_metrics")
        if not isinstance(metrics, dict):
            continue
        try:
            parsed = float(metrics.get("train_loss"))
        except (TypeError, ValueError):
            # Missing key, None, or a non-numeric value — skip this stage.
            continue
        if parsed >= 0.0:
            collected.append(parsed)

    # Keep only the tail so the sparkline stays bounded.
    return collected[-limit:] if len(collected) > limit else collected
|
| 481 |
+
|
| 482 |
+
|
| 483 |
def _build_loss_sparkline(losses: List[float]) -> str:
|
| 484 |
if not losses:
|
| 485 |
return "<div class='ops-v-small'>No live loss points yet.</div>"
|
|
|
|
| 550 |
|
| 551 |
run_label = html.escape(str(safe_summary.get("run_label") or "not-started"))
|
| 552 |
status_value = html.escape(status_text or "Idle")
|
| 553 |
+
runtime_mode = "GPU READY" if runtime.get("cuda_available") else "CPU FALLBACK"
|
| 554 |
runtime_mode = html.escape(runtime_mode)
|
| 555 |
device_count = _safe_int(runtime.get("cuda_device_count"), 0)
|
| 556 |
|
|
|
|
| 572 |
stage_hint = html.escape(stage_hint)
|
| 573 |
|
| 574 |
losses = _extract_loss_values(log_text)
|
| 575 |
+
if len(losses) < 2:
|
| 576 |
+
summary_losses = _extract_summary_loss_values(safe_summary)
|
| 577 |
+
if summary_losses:
|
| 578 |
+
losses = summary_losses
|
| 579 |
sparkline_html = _build_loss_sparkline(losses)
|
| 580 |
|
| 581 |
pass_k = _fmt_pct(evaluation.get("pass_at_k"))
|
|
|
|
| 1067 |
yield "\n".join(log_lines), "Validating environment", summary_text(summary)
|
| 1068 |
|
| 1069 |
if not preflight_only and not torch.cuda.is_available():
|
| 1070 |
+
summary["compute_mode"] = "cpu_fallback"
|
| 1071 |
+
append_log(
|
| 1072 |
+
log_lines,
|
| 1073 |
+
"GPU is unavailable. Continuing with CPU fallback mode; training will be slower.",
|
| 1074 |
)
|
| 1075 |
+
yield "\n".join(log_lines), "CPU fallback active", summary_text(summary)
|
| 1076 |
+
elif torch.cuda.is_available():
|
| 1077 |
+
summary["compute_mode"] = "gpu"
|
| 1078 |
|
| 1079 |
effective_push_to_hub = bool(push_to_hub)
|
| 1080 |
if effective_push_to_hub and not token:
|
scripts/train_sota.py
CHANGED
|
@@ -432,8 +432,12 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
|
|
| 432 |
if not base_model:
|
| 433 |
raise ValueError("model.base_model is required.")
|
| 434 |
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
tokenizer = build_tokenizer(model_cfg)
|
| 439 |
|
|
@@ -445,10 +449,11 @@ def build_model_and_tokenizer(model_cfg: Dict[str, Any], training_defaults: Dict
|
|
| 445 |
if attn_impl:
|
| 446 |
model_kwargs["attn_implementation"] = attn_impl
|
| 447 |
|
| 448 |
-
|
|
|
|
|
|
|
|
|
|
| 449 |
if load_in_4bit:
|
| 450 |
-
if not torch.cuda.is_available():
|
| 451 |
-
raise RuntimeError("4-bit loading requested but CUDA is not available.")
|
| 452 |
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
| 453 |
load_in_4bit=True,
|
| 454 |
bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
|
|
@@ -565,6 +570,9 @@ def build_training_args(
|
|
| 565 |
has_eval_split: bool,
|
| 566 |
) -> TrainingArguments:
|
| 567 |
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
| 568 |
return TrainingArguments(
|
| 569 |
output_dir=str(output_dir),
|
| 570 |
num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
|
|
@@ -582,8 +590,8 @@ def build_training_args(
|
|
| 582 |
save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
|
| 583 |
dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
|
| 584 |
seed=as_int(training_cfg.get("seed"), 17),
|
| 585 |
-
bf16=
|
| 586 |
-
fp16=
|
| 587 |
remove_unused_columns=False,
|
| 588 |
report_to="none",
|
| 589 |
evaluation_strategy="steps" if has_eval_split else "no",
|
|
@@ -860,6 +868,10 @@ def main() -> None:
|
|
| 860 |
model = None
|
| 861 |
else:
|
| 862 |
model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
|
| 864 |
data_cfg = cfg["data"]
|
| 865 |
stage_reports: List[Dict[str, Any]] = []
|
|
|
|
| 432 |
if not base_model:
|
| 433 |
raise ValueError("model.base_model is required.")
|
| 434 |
|
| 435 |
+
use_cuda = torch.cuda.is_available()
|
| 436 |
+
requested_bf16 = bool(model_cfg.get("use_bf16", True))
|
| 437 |
+
if use_cuda:
|
| 438 |
+
dtype = torch.bfloat16 if requested_bf16 else torch.float16
|
| 439 |
+
else:
|
| 440 |
+
dtype = torch.float32
|
| 441 |
|
| 442 |
tokenizer = build_tokenizer(model_cfg)
|
| 443 |
|
|
|
|
| 449 |
if attn_impl:
|
| 450 |
model_kwargs["attn_implementation"] = attn_impl
|
| 451 |
|
| 452 |
+
requested_load_in_4bit = bool(model_cfg.get("load_in_4bit", True))
|
| 453 |
+
load_in_4bit = requested_load_in_4bit and use_cuda
|
| 454 |
+
if requested_load_in_4bit and not load_in_4bit:
|
| 455 |
+
print("CUDA unavailable. Disabling 4-bit loading and using full-precision CPU fallback.")
|
| 456 |
if load_in_4bit:
|
|
|
|
|
|
|
| 457 |
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
| 458 |
load_in_4bit=True,
|
| 459 |
bnb_4bit_quant_type=as_text(model_cfg.get("bnb_4bit_quant_type")) or "nf4",
|
|
|
|
| 570 |
has_eval_split: bool,
|
| 571 |
) -> TrainingArguments:
|
| 572 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 573 |
+
use_cuda = torch.cuda.is_available()
|
| 574 |
+
bf16_runtime = bool(use_cuda and use_bf16)
|
| 575 |
+
fp16_runtime = bool(use_cuda and not bf16_runtime)
|
| 576 |
return TrainingArguments(
|
| 577 |
output_dir=str(output_dir),
|
| 578 |
num_train_epochs=as_float(training_cfg.get("num_train_epochs"), 1.0),
|
|
|
|
| 590 |
save_total_limit=as_int(training_cfg.get("save_total_limit"), 3),
|
| 591 |
dataloader_num_workers=as_int(training_cfg.get("dataloader_num_workers"), 0),
|
| 592 |
seed=as_int(training_cfg.get("seed"), 17),
|
| 593 |
+
bf16=bf16_runtime,
|
| 594 |
+
fp16=fp16_runtime,
|
| 595 |
remove_unused_columns=False,
|
| 596 |
report_to="none",
|
| 597 |
evaluation_strategy="steps" if has_eval_split else "no",
|
|
|
|
| 868 |
model = None
|
| 869 |
else:
|
| 870 |
model, tokenizer = build_model_and_tokenizer(cfg["model"], cfg.get("training_defaults", {}))
|
| 871 |
+
if torch.cuda.is_available():
|
| 872 |
+
print("Compute mode: GPU")
|
| 873 |
+
else:
|
| 874 |
+
print("Compute mode: CPU fallback (no CUDA detected)")
|
| 875 |
|
| 876 |
data_cfg = cfg["data"]
|
| 877 |
stage_reports: List[Dict[str, Any]] = []
|