Harshit200431 committed on
Commit
ed29027
·
2 Parent(s): e03ae4e939dba8

Added GPU UI hardcoded

Browse files
app.py CHANGED
@@ -17,7 +17,7 @@ from fastapi.staticfiles import StaticFiles
17
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
18
  from pydantic import BaseModel
19
 
20
- from cluster_trust_env import ClusterTrustEnv
21
  from difficulty_controller import GLOBAL_DIFFICULTY_CONTROLLER
22
  from environment import SentinelEnv
23
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
@@ -130,6 +130,19 @@ def _get_env(session_id: str) -> SentinelEnv | ClusterTrustEnv:
130
  return env
131
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def _resolve_env_mode(task_type: str | None, mode: str | None = None) -> tuple[str, str]:
134
  requested_task = task_type or "task3"
135
  requested_mode = (mode or "").lower()
@@ -218,6 +231,27 @@ class StepRequest(BaseModel):
218
  reasoning: str | None = None
219
 
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  # ---------------------------------------------------------------------------
222
  # Endpoints
223
  # ---------------------------------------------------------------------------
@@ -255,6 +289,11 @@ def root():
255
  "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
256
  "/cluster-dashboard",
257
  "/reset", "/step", "/state",
 
 
 
 
 
258
  ],
259
  }
260
  )
@@ -308,6 +347,11 @@ def api_root():
308
  "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
309
  "/cluster-dashboard",
310
  "/reset", "/step", "/state",
 
 
 
 
 
311
  ],
312
  }
313
 
@@ -369,8 +413,13 @@ def metadata():
369
  },
370
  "adaptive_curriculum": GLOBAL_DIFFICULTY_CONTROLLER.state(),
371
  "cluster_mode": {
372
- "how_to_enable": "POST /reset with {\"mode\":\"cluster\",\"task_type\":\"task3\"} or {\"task_type\":\"cluster_task3\"}.",
 
 
 
 
373
  "live_dashboard": "/cluster-dashboard?session_id=<session_id>",
 
374
  },
375
  }
376
 
@@ -578,6 +627,228 @@ def mcp(body: dict[str, Any]):
578
  raise HTTPException(status_code=400, detail=f"Unknown method: {method}")
579
 
580
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  def _trust_dashboard_html(session_id: str) -> str:
582
  escaped_session = html.escape(session_id, quote=True)
583
  return f"""<!doctype html>
 
17
  from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, StreamingResponse
18
  from pydantic import BaseModel
19
 
20
+ from cluster_trust_env import CLUSTER_TASK_CONFIG, ClusterTrustEnv
21
  from difficulty_controller import GLOBAL_DIFFICULTY_CONTROLLER
22
  from environment import SentinelEnv
23
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
 
130
  return env
131
 
132
 
133
def _get_cluster_env(session_id: str) -> ClusterTrustEnv:
    """Fetch the session's environment and require it to be a cluster one.

    The lookup itself is delegated to ``_get_env``; anything it raises
    propagates unchanged.

    Raises:
        HTTPException: 400 when the session holds an environment that is not
            a ``ClusterTrustEnv``.
    """
    candidate = _get_env(session_id)
    if isinstance(candidate, ClusterTrustEnv):
        return candidate
    message = (
        "Session is in abstract SentinelEnv mode. Start a cluster session via "
        "POST /cluster/reset (or POST /reset with mode='cluster')."
    )
    raise HTTPException(status_code=400, detail=message)
144
+
145
+
146
  def _resolve_env_mode(task_type: str | None, mode: str | None = None) -> tuple[str, str]:
147
  requested_task = task_type or "task3"
148
  requested_mode = (mode or "").lower()
 
231
  reasoning: str | None = None
232
 
233
 
234
# Cluster-only request shapes. Kept separate from ResetRequest/StepRequest so
# the OpenAPI schema makes the GPU-cluster contract explicit.

# Action names accepted by POST /cluster/step; the handler rejects any other
# action_type with HTTP 400 before the environment is touched.
CLUSTER_ACTION_TYPES = ("allocate", "preempt", "request_info", "verify", "tick")
238
+
239
+
240
# Request body for POST /cluster/reset. Every field is optional, so an empty
# body is valid.
class ClusterResetRequest(BaseModel):
    # None (or an empty string) is resolved to "task3" by _cluster_task_type.
    task_type: str | None = None  # "task1" | "task2" | "task3" (also accepts "cluster_task*")
    # Forwarded unchanged to ClusterTrustEnv.reset(seed=...).
    seed: int | None = None
    # Forwarded unchanged to ClusterTrustEnv.reset(adaptive=...).
    adaptive: bool = False
244
+
245
+
246
# Request body for POST /cluster/step. The handler serialises it with
# model_dump(exclude_none=True), so fields left as None never reach env.step().
# Which optional fields each action reads is listed by GET /cluster/metadata.
class ClusterStepRequest(BaseModel):
    # Must be one of CLUSTER_ACTION_TYPES; anything else is answered with HTTP 400.
    action_type: str  # allocate | preempt | request_info | verify | tick
    job_id: str | None = None
    gpu_id: str | None = None
    worker_id: str | None = None
    # Listed only for the "verify" action in /cluster/metadata.
    force_flag: bool | None = None
    # NOTE(review): presumably free-text rationale from the caller; how the
    # environment uses it is not visible here — confirm against env.step().
    reasoning: str | None = None
253
+
254
+
255
  # ---------------------------------------------------------------------------
256
  # Endpoints
257
  # ---------------------------------------------------------------------------
 
289
  "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
290
  "/cluster-dashboard",
291
  "/reset", "/step", "/state",
292
+ "/cluster", "/cluster/metadata", "/cluster/tasks",
293
+ "/cluster/reset", "/cluster/step", "/cluster/state",
294
+ "/cluster/gpus", "/cluster/jobs", "/cluster/workers",
295
+ "/cluster/audit", "/cluster/audit/investigate",
296
+ "/cluster/ai-failure-coverage", "/cluster/reward-report", "/cluster/stream",
297
  ],
298
  }
299
  )
 
347
  "/grader", "/reward-report", "/difficulty", "/stream", "/trust-dashboard",
348
  "/cluster-dashboard",
349
  "/reset", "/step", "/state",
350
+ "/cluster", "/cluster/metadata", "/cluster/tasks",
351
+ "/cluster/reset", "/cluster/step", "/cluster/state",
352
+ "/cluster/gpus", "/cluster/jobs", "/cluster/workers",
353
+ "/cluster/audit", "/cluster/audit/investigate",
354
+ "/cluster/ai-failure-coverage", "/cluster/reward-report", "/cluster/stream",
355
  ],
356
  }
357
 
 
413
  },
414
  "adaptive_curriculum": GLOBAL_DIFFICULTY_CONTROLLER.state(),
415
  "cluster_mode": {
416
+ "how_to_enable": (
417
+ "POST /cluster/reset with {\"task_type\":\"task3\"} (preferred), "
418
+ "or POST /reset with {\"mode\":\"cluster\",\"task_type\":\"task3\"} "
419
+ "or {\"task_type\":\"cluster_task3\"}."
420
+ ),
421
  "live_dashboard": "/cluster-dashboard?session_id=<session_id>",
422
+ "api_root": "/cluster",
423
  },
424
  }
425
 
 
627
  raise HTTPException(status_code=400, detail=f"Unknown method: {method}")
628
 
629
 
630
+ # ---------------------------------------------------------------------------
631
+ # Cluster API (GPU cluster trust mission, namespaced under /cluster/*)
632
+ # ---------------------------------------------------------------------------
633
+
634
+
635
def _cluster_task_type(raw: str | None) -> str:
    """Normalise a requested task id to a key of ``CLUSTER_TASK_CONFIG``.

    A missing or empty value means ``"task3"``. A leading ``"cluster_"`` is
    dropped, so ``"task2"`` and ``"cluster_task2"`` resolve to the same entry.

    Raises:
        HTTPException: 400 when the normalised id is not a configured task.
    """
    requested = raw if raw else "task3"
    normalized = requested.removeprefix("cluster_")
    if normalized in CLUSTER_TASK_CONFIG:
        return normalized
    known = ", ".join(sorted(CLUSTER_TASK_CONFIG))
    raise HTTPException(
        status_code=400,
        detail=f"Unknown cluster task_type '{raw}'. Expected one of: {known}.",
    )
646
+
647
+
648
@app.get("/cluster")
def cluster_root():
    """Describe the /cluster API: summary, session lifecycle and route list."""
    summary = (
        "GPU cluster trust calibration API. The orchestrator schedules jobs across "
        "GPUs, audits worker reports, and routes around adversarial false completions "
        "while keeping cluster health and AI reliability high."
    )
    lifecycle = [
        "POST /cluster/reset -> {info.session_id}",
        "POST /cluster/step?session_id=...",
        "GET /cluster/state?session_id=... (or /cluster/stream for SSE)",
    ]
    # (method, path) pairs, rendered below as "<METHOD> <path>" strings.
    route_table = (
        ("POST", "/cluster/reset"),
        ("POST", "/cluster/step"),
        ("GET", "/cluster/state"),
        ("GET", "/cluster/gpus"),
        ("GET", "/cluster/jobs"),
        ("GET", "/cluster/workers"),
        ("GET", "/cluster/audit"),
        ("GET", "/cluster/audit/investigate"),
        ("GET", "/cluster/ai-failure-coverage"),
        ("GET", "/cluster/reward-report"),
        ("GET", "/cluster/stream"),
        ("GET", "/cluster/metadata"),
        ("GET", "/cluster/tasks"),
        ("GET", "/cluster-dashboard"),
    )
    return {
        "name": "sentinel-cluster",
        "summary": summary,
        "session_lifecycle": lifecycle,
        "routes": [f"{method} {path}" for method, path in route_table],
    }
679
+
680
+
681
@app.get("/cluster/metadata")
def cluster_metadata():
    """Return static metadata for the cluster mission.

    The payload has the per-task configuration (each entry of
    ``CLUSTER_TASK_CONFIG`` plus a display name), the supported action types
    with the optional fields each one reads, the public worker ids, one-line
    scoring and terminal-score summaries, and the current state of the global
    difficulty controller.
    """
    task_names = {
        "task1": "Cluster Basics",
        "task2": "Unreliable Workers",
        "task3": "Full Adversarial Cluster",
    }
    return {
        "tasks": {
            task_id: {**CLUSTER_TASK_CONFIG[task_id], "name": label}
            for task_id, label in task_names.items()
        },
        "action_types": {
            "allocate": {
                "description": "Place a queued job on a GPU and assign a worker.",
                "fields": ["job_id?", "gpu_id?", "worker_id?"],
            },
            "preempt": {
                "description": "Free a running job from its GPU.",
                "fields": ["job_id?"],
            },
            "request_info": {
                "description": "Ask the assigned worker for a fresh progress report.",
                "fields": ["job_id?", "worker_id?"],
            },
            "verify": {
                "description": "Audit a worker's report. Catches false completions and lying.",
                "fields": ["job_id?", "worker_id?", "force_flag?"],
            },
            "tick": {
                "description": "Advance the cluster clock without acting.",
                "fields": [],
            },
        },
        # A plain literal: wrapping it in list() only built a throwaway copy.
        "workers": ["S0", "S1", "S2", "S3", "S4"],
        # Uses the real multiplication sign; the previous text carried a
        # mis-decoded byte pair in its place.
        "scoring": (
            "global_reward = weighted(orchestrator, resource_manager, auditor, worker) "
            "× cluster_health × ai_reliability_modifier"
        ),
        "terminal": "task1: jobs+util | task2: jobs+calibration+deadlines | task3: jobs+detection+plan_coherence+efficiency",
        "controller": GLOBAL_DIFFICULTY_CONTROLLER.state(),
    }
706
+
707
+
708
@app.get("/cluster/tasks")
def cluster_tasks():
    """List every configured cluster task with its difficulty and limits.

    The result is keyed by task id, in ``CLUSTER_TASK_CONFIG`` order. A task
    with no curated description or difficulty label is still listed, with
    ``"unknown"`` difficulty and an empty description, so adding a task to
    the config cannot turn this endpoint into an HTTP 500.
    """
    # Both lookup tables are loop-invariant, so they are built once up front.
    difficulties = {"task1": "easy", "task2": "medium", "task3": "hard"}
    descriptions = {
        "task1": "10-job warmup. No adversary, no GPU failures. Learn the allocate/preempt/tick loop.",
        "task2": "20-job stream with unreliable/slow/degrading workers and rare GPU failures.",
        "task3": "30-job adversarial cluster: false memory reports, false completions, poisoned reward claims.",
    }
    return {
        task_id: {
            "difficulty": difficulties.get(task_id, "unknown"),
            "description": descriptions.get(task_id, ""),
            "adversary_active": cfg["adversary"],
            "jobs": cfg["jobs"],
            "gpus": cfg["gpus"],
            "max_steps": cfg["max_steps"],
            "failure_probability": cfg["failure_probability"],
        }
        for task_id, cfg in CLUSTER_TASK_CONFIG.items()
    }
727
+
728
+
729
@app.post("/cluster/reset")
def cluster_reset(req: ClusterResetRequest = ClusterResetRequest()):
    """Start a fresh cluster episode and register it under its session id.

    The task id is validated before any environment is created. The reset
    payload is passed through ``_add_demo_context`` before it is returned.
    """
    resolved_task = _cluster_task_type(req.task_type)
    cluster_env = ClusterTrustEnv()
    payload = cluster_env.reset(
        task_type=resolved_task,
        seed=req.seed,
        adaptive=req.adaptive,
    )
    _sessions.set(payload["info"]["session_id"], cluster_env)
    return _add_demo_context(payload, cluster_env)
737
+
738
+
739
@app.post("/cluster/step")
def cluster_step(req: ClusterStepRequest, session_id: str = Query(...)):
    """Apply one orchestrator action to a cluster session.

    Only fields the caller actually set are forwarded to ``env.step``
    (``exclude_none=True``).

    Raises:
        HTTPException: 400 for an unknown ``action_type``, for a session that
            is not in cluster mode, or when the environment rejects the
            action with ``RuntimeError``/``ValueError``.
    """
    if req.action_type not in CLUSTER_ACTION_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown cluster action_type '{req.action_type}'. Expected one of: {', '.join(CLUSTER_ACTION_TYPES)}.",
        )
    env = _get_cluster_env(session_id)
    try:
        result = env.step(req.model_dump(exclude_none=True))
    except (RuntimeError, ValueError) as exc:
        # Chain the cause so server logs keep the environment's traceback.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    if result["done"]:
        # A finished episode is dropped from the session store; its final
        # result is returned without demo context.
        _sessions.pop(session_id)
    else:
        _add_demo_context(result, env)
    return result
757
+
758
+
759
@app.get("/cluster/state")
def cluster_state(session_id: str = Query(...)):
    """Return ``state()`` of the cluster environment bound to ``session_id``."""
    return _get_cluster_env(session_id).state()
763
+
764
+
765
@app.get("/cluster/gpus")
def cluster_gpus(session_id: str = Query(...), include_hidden: bool = Query(False)):
    """Return the GPU pool summary plus a per-GPU snapshot.

    ``include_hidden`` is handed straight to the pool's ``snapshot``.
    """
    pool = _get_cluster_env(session_id)._pool
    pool_summary = pool.summary()
    gpu_rows = pool.snapshot(include_hidden=include_hidden)
    return {"summary": pool_summary, "gpus": gpu_rows}
772
+
773
+
774
@app.get("/cluster/jobs")
def cluster_jobs(
    session_id: str = Query(...),
    include_hidden: bool = Query(False),
    deadline_window: int = Query(10, ge=1, le=240),
):
    """Return the job queue summary, a job snapshot and deadline pressure.

    ``deadline_pressure`` holds the ids of the jobs that the queue's
    ``deadline_pressure(step_count, window=deadline_window)`` call reports.
    """
    env = _get_cluster_env(session_id)
    queue = env._jobs
    queue_summary = queue.summary()
    job_rows = queue.snapshot(include_hidden=include_hidden)
    pressured_ids = []
    for pressured in queue.deadline_pressure(env.step_count, window=deadline_window):
        pressured_ids.append(pressured.job_id)
    return {
        "summary": queue_summary,
        "jobs": job_rows,
        "deadline_pressure": pressured_ids,
    }
788
+
789
+
790
@app.get("/cluster/workers")
def cluster_workers(session_id: str = Query(...)):
    """Return worker availability together with the trust ledger's view of them."""
    env = _get_cluster_env(session_id)
    roster, ledger = env._workers, env._trust
    available = roster.available_ids()
    trust_view = ledger.snapshot()
    fingerprints = ledger.behavioral_fingerprints()
    reliability = roster.public_ground_truth_reliability()
    return {
        "available": available,
        "trust_snapshot": trust_view,
        "behavioral_fingerprints": fingerprints,
        "public_ground_truth_reliability": reliability,
    }
799
+
800
+
801
@app.get("/cluster/audit")
def cluster_audit(session_id: str = Query(...)):
    """Return the audit component's snapshot for the session."""
    audit_log = _get_cluster_env(session_id)._audit
    return audit_log.snapshot()
805
+
806
+
807
@app.get("/cluster/audit/investigate")
def cluster_audit_investigate(
    session_id: str = Query(...),
    agent_id: str = Query(..., description="Worker public id (S0..S4) or 'cluster'/'adversary'/'auditor'."),
    window: int = Query(10, ge=1, le=240),
):
    """Run the audit component's ``investigate`` for one agent and return its result."""
    audit_log = _get_cluster_env(session_id)._audit
    return audit_log.investigate(agent_id, window=window)
815
+
816
+
817
@app.get("/cluster/ai-failure-coverage")
def cluster_ai_failure_coverage(session_id: str = Query(...)):
    """Return the environment's ``ai_failure_coverage()`` report for the session."""
    return _get_cluster_env(session_id).ai_failure_coverage()
821
+
822
+
823
@app.get("/cluster/reward-report")
def cluster_reward_report(session_id: str = Query(...)):
    """Return the environment's ``reward_report()`` for the session."""
    return _get_cluster_env(session_id).reward_report()
827
+
828
+
829
@app.get("/cluster/stream")
async def cluster_stream(session_id: str = Query(...)):
    """Stream cluster snapshots for a session as server-sent events.

    A ``data:`` frame with ``stream_snapshot()`` is emitted every 0.5 s until
    the environment reports ``done``. If the session is missing, or is not a
    cluster session, a single ``close`` event is sent and the stream ends.
    """
    closing_frame = (
        'event: close\n'
        'data: {"reason":"session_not_found_or_not_cluster"}\n\n'
    )

    async def event_gen():
        while True:
            # Re-read the store on every tick so an evicted session is noticed.
            current = _sessions.get(session_id)
            # isinstance() is False for None, so this also covers a missing session.
            if not isinstance(current, ClusterTrustEnv):
                yield closing_frame
                return
            yield f"data: {json.dumps(current.stream_snapshot())}\n\n"
            if current.done:
                return
            await asyncio.sleep(0.5)

    sse_headers = {"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}
    return StreamingResponse(
        event_gen(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
850
+
851
+
852
  def _trust_dashboard_html(session_id: str) -> str:
853
  escaped_session = html.escape(session_id, quote=True)
854
  return f"""<!doctype html>
requirements-train.txt CHANGED
@@ -1,10 +1,10 @@
1
  unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
2
- trl<0.13
3
- transformers>=4.46
4
- datasets
5
- accelerate
6
- peft
7
- bitsandbytes
8
  matplotlib
9
  seaborn
10
  pandas
 
1
  unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
2
+ trl>=0.18.2,<0.25,!=0.19.0
3
+ transformers>=4.56,<5
4
+ datasets>=3.0,<5
5
+ accelerate>=1.4
6
+ peft>=0.14
7
+ bitsandbytes>=0.45
8
  matplotlib
9
  seaborn
10
  pandas
training/colab_notebook.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
training/launch_hf_job.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ import shlex
6
+ import sys
7
+ from textwrap import dedent
8
+
9
+ from huggingface_hub import run_job
10
+
11
+
12
+ DEFAULT_IMAGE = "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel"
13
+ DEFAULT_REPO = "https://github.com/ADITYAGABA1322/sentinel-env"
14
+ DEFAULT_MODEL = "unsloth/Qwen2.5-0.5B-Instruct"
15
+
16
+
17
def shell_join(lines: list[str]) -> str:
    """Chain shell fragments with ``&&``, trimming each and skipping blank ones."""
    trimmed = (fragment.strip() for fragment in lines)
    return " && ".join(fragment for fragment in trimmed if fragment)
19
+
20
+
21
def bootstrap_repo(repo_url: str) -> list[str]:
    """Return the shell steps that prepare a job container for training.

    The steps enable strict/verbose shell mode, make sure git is available,
    clone ``repo_url`` into ``sentinel-env``, enter it, and install both
    requirement files. The URL is shell-quoted.
    """
    clone_step = "git clone " + shlex.quote(repo_url) + " sentinel-env"
    steps = [
        "set -eux",
        "command -v git || (apt-get update && apt-get install -y git)",
        clone_step,
        "cd sentinel-env",
        "python -m pip install --upgrade pip",
    ]
    for requirements_file in ("requirements.txt", "requirements-train.txt"):
        steps.append("pip install -r " + requirements_file)
    return steps
31
+
32
+
33
def gpu_test_command() -> str:
    """Return a one-line shell command that prints the CUDA device name via torch."""
    probe = "import torch; print(torch.cuda.get_device_name())"
    return f"python -c '{probe}'"
35
+
36
+
37
def train_command(args: argparse.Namespace) -> str:
    """Build the single ``&&``-chained shell command a training job runs.

    The command bootstraps the repository and then runs
    ``training/train.py`` with the hyper-parameters taken from ``args``. In
    ``train-full`` mode it additionally records a replay of the trained
    policy, evaluates it, renders plots and uploads the adapter and the
    ``outputs`` directory to the Hub.

    User-controlled strings (``output_dir``, ``model``) are embedded in the
    inline Python snippets as ``repr()`` literals, and each snippet is passed
    to ``python -c`` through ``shlex.quote``. A value containing quotes,
    ``$``, backticks or backslashes can therefore neither break the command
    nor be interpreted by the shell.

    Args:
        args: Parsed CLI options, as produced by ``parse_args``.

    Returns:
        The full shell command as one string.
    """
    lines = bootstrap_repo(args.repo_url)
    train_argv = [
        "python training/train.py",
        f"--episodes {args.episodes}",
        f"--task {shlex.quote(args.task)}",
        f"--seed {args.seed}",
        f"--model {shlex.quote(args.model)}",
        f"--epochs {args.epochs}",
        f"--batch-size {args.batch_size}",
        f"--learning-rate {args.learning_rate}",
        f"--lora-rank {args.lora_rank}",
        f"--num-generations {args.num_generations}",
        f"--max-seq-length {args.max_seq_length}",
        f"--output-dir {shlex.quote(args.output_dir)}",
    ]
    lines.append(" ".join(train_argv))

    if args.mode == "train-full":
        # Python source executed inside the job; !r yields a correctly
        # escaped string literal whatever the value contains.
        replay_code = (
            "from training.replay import record_trained_actions; "
            f"record_trained_actions(adapter_path={args.output_dir!r}, "
            f"base_model={args.model!r}, tasks=['task1','task2','task3'], "
            "seeds=range(30), out_path='outputs/trained_policy_replay.jsonl')"
        )
        # Only the folder_path line is an f-string; the f'job-{job_id}/...'
        # fragments are literal text evaluated later, inside the job.
        upload_code = (
            "import os; "
            "from huggingface_hub import HfApi; "
            "token=os.environ.get('HF_TOKEN'); "
            "api=HfApi(token=token); "
            "model_repo=os.environ.get('SENTINEL_MODEL_REPO','XcodeAddy/sentinel-grpo-qwen05'); "
            "artifact_repo=os.environ.get('SENTINEL_ARTIFACT_REPO','XcodeAddy/sentinel-env-artifacts'); "
            "job_id=os.environ.get('JOB_ID','manual'); "
            "api.create_repo(model_repo, repo_type='model', exist_ok=True); "
            f"api.upload_folder(folder_path={args.output_dir!r}, repo_id=model_repo, repo_type='model'); "
            "api.create_repo(artifact_repo, repo_type='dataset', exist_ok=True); "
            "api.upload_folder(folder_path='outputs', repo_id=artifact_repo, repo_type='dataset', path_in_repo=f'job-{job_id}/outputs'); "
            "print('Uploaded model adapter to', model_repo); "
            "print('Uploaded outputs to', artifact_repo, 'under', f'job-{job_id}/outputs')"
        )
        lines.extend(
            [
                f"python -c {shlex.quote(replay_code)}",
                "python training/evaluate.py --episodes 30 --task all "
                "--policies random,heuristic,oracle_lite,trained "
                "--replay outputs/trained_policy_replay.jsonl "
                "--out outputs/eval_post.json --no-plot",
                "cp outputs/eval_post.json outputs/evaluation_results.json",
                # NOTE(review): outputs/eval_pre.json is not produced by any
                # step in this command — confirm that train.py or the repo
                # provides it.
                "python -m training.plots --pre outputs/eval_pre.json "
                "--post outputs/eval_post.json --out-dir outputs/charts",
                f"python -c {shlex.quote(upload_code)}",
            ]
        )
    return shell_join(lines)
90
+
91
+
92
def parse_args() -> argparse.Namespace:
    """Parse the launcher's command-line options.

    ``--namespace`` defaults to the ``HF_NAMESPACE`` environment variable
    when it is set. Options are registered in the order listed below.
    """
    cli = argparse.ArgumentParser(
        description="Launch SENTINEL training on Hugging Face Jobs without shell quoting pain."
    )
    # (flag, add_argument keyword arguments), in registration order.
    option_specs = (
        ("--mode", {"choices": ["gpu-test", "train-smoke", "train-full"], "default": "gpu-test"}),
        ("--namespace", {"default": os.environ.get("HF_NAMESPACE", "XcodeAddy")}),
        ("--flavor", {"default": "a10g-small"}),
        ("--timeout", {"default": "2h"}),
        ("--image", {"default": DEFAULT_IMAGE}),
        ("--repo-url", {"default": DEFAULT_REPO}),
        ("--model", {"default": DEFAULT_MODEL}),
        ("--episodes", {"type": int, "default": 50}),
        ("--task", {"choices": ["task1", "task2", "task3", "all"], "default": "all"}),
        ("--seed", {"type": int, "default": 0}),
        ("--epochs", {"type": int, "default": 1}),
        ("--batch-size", {"type": int, "default": 2}),
        ("--learning-rate", {"type": float, "default": 5e-6}),
        ("--lora-rank", {"type": int, "default": 8}),
        ("--num-generations", {"type": int, "default": 2}),
        ("--max-seq-length", {"type": int, "default": 1024}),
        ("--output-dir", {"default": "training/sentinel_qwen05_grpo"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
114
+
115
+
116
def main() -> None:
    """Parse CLI options and submit the selected job to Hugging Face Jobs.

    The job runs ``bash -lc <command>`` in the chosen image. ``HF_TOKEN`` is
    used to authenticate the submission and is also passed to the job as a
    secret. The target repos for uploads are passed as plain environment
    variables; a locally exported ``SENTINEL_MODEL_REPO`` or
    ``SENTINEL_ARTIFACT_REPO`` overrides the built-in default.

    Raises:
        SystemExit: when ``HF_TOKEN`` is missing from the environment.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise SystemExit(
            dedent(
                """
                HF_TOKEN is not set.

                Run:
                read -s HF_TOKEN
                export HF_TOKEN
                Then paste your Hugging Face write token.
                """
            ).strip()
        )

    command = gpu_test_command() if args.mode == "gpu-test" else train_command(args)
    print("Launching HF Job:")
    print(f" mode = {args.mode}")
    print(f" namespace = {args.namespace}")
    print(f" flavor = {args.flavor}")
    print(f" timeout = {args.timeout}")
    print(f" image = {args.image}")
    # Only a 260-character preview is printed; the full command is sent below.
    print(" command = bash -lc", shlex.quote(command[:260] + ("..." if len(command) > 260 else "")))

    # The in-job upload snippet reads these two variables, so forward any
    # local override instead of always pinning the defaults. `or` also
    # covers a variable that is exported but empty.
    job_env = {
        "SENTINEL_MODEL_REPO": os.environ.get("SENTINEL_MODEL_REPO")
        or "XcodeAddy/sentinel-grpo-qwen05",
        "SENTINEL_ARTIFACT_REPO": os.environ.get("SENTINEL_ARTIFACT_REPO")
        or "XcodeAddy/sentinel-env-artifacts",
    }

    job = run_job(
        image=args.image,
        command=["bash", "-lc", command],
        flavor=args.flavor,
        timeout=args.timeout,
        namespace=args.namespace,
        token=token,
        secrets={"HF_TOKEN": token},
        env=job_env,
        labels={"project": "sentinel", "mode": args.mode},
    )
    print("Job launched.")
    print("URL:", job.url)
    print("ID:", job.id)
    print()
    print("Follow logs with:")
    print(f" .venv/bin/hf jobs logs -f {job.id} --namespace {args.namespace} --token \"$HF_TOKEN\"")
162
+
163
+
164
# Script entry point: run the launcher only when executed directly.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Exit with status 130 (the shell convention of 128 + SIGINT) on
        # Ctrl-C instead of letting the KeyboardInterrupt traceback print.
        sys.exit(130)
training/train.py CHANGED
@@ -174,6 +174,7 @@ def run_grpo(args) -> None:
174
  learning_rate=args.learning_rate,
175
  num_train_epochs=args.epochs,
176
  per_device_train_batch_size=args.batch_size,
 
177
  logging_steps=10,
178
  save_steps=50,
179
  max_prompt_length=args.max_seq_length,
@@ -210,6 +211,7 @@ def main() -> None:
210
  parser.add_argument("--learning-rate", type=float, default=5e-6)
211
  parser.add_argument("--max-seq-length", type=int, default=1024)
212
  parser.add_argument("--lora-rank", type=int, default=16)
 
213
  args = parser.parse_args()
214
 
215
  if args.dry_run:
 
174
  learning_rate=args.learning_rate,
175
  num_train_epochs=args.epochs,
176
  per_device_train_batch_size=args.batch_size,
177
+ num_generations=args.num_generations,
178
  logging_steps=10,
179
  save_steps=50,
180
  max_prompt_length=args.max_seq_length,
 
211
  parser.add_argument("--learning-rate", type=float, default=5e-6)
212
  parser.add_argument("--max-seq-length", type=int, default=1024)
213
  parser.add_argument("--lora-rank", type=int, default=16)
214
+ parser.add_argument("--num-generations", type=int, default=2)
215
  args = parser.parse_args()
216
 
217
  if args.dry_run:
ui/app/components/GPUClusterPanel.tsx CHANGED
@@ -11,63 +11,45 @@ interface GPUNode {
11
  memory: number;
12
  load: number;
13
  status: NodeStatus;
14
- temp: number;
15
  }
16
 
17
  export default function GPUClusterPanel() {
18
  const [mounted, setMounted] = useState(false);
19
  const [nodes, setNodes] = useState<GPUNode[]>([
20
- { id: "GPU-1", utilization: 45, memory: 32, load: 1.2, status: "ACTIVE", temp: 55 },
21
- { id: "GPU-2", utilization: 12, memory: 8, load: 0.4, status: "IDLE", temp: 42 },
22
- { id: "GPU-3", utilization: 88, memory: 64, load: 2.8, status: "ACTIVE", temp: 78 },
23
- { id: "GPU-4", utilization: 0, memory: 0, load: 0, status: "IDLE", temp: 35 },
24
  ]);
25
 
26
  const [avgLoad, setAvgLoad] = useState(0);
27
- const [logs, setLogs] = useState<string[]>([]);
28
- const logRef = useRef<HTMLDivElement>(null);
29
 
30
  useEffect(() => {
31
  setMounted(true);
32
  const interval = setInterval(() => {
 
33
  setNodes((prev) =>
34
  prev.map((node) => {
35
  if (node.status === "FAILED") {
36
- if (Math.random() > 0.95) {
37
- addLog(`[RECOVERY] ${node.id} initialized. Performing self-test...`);
38
- return { ...node, status: "IDLE", utilization: 0, load: 0 };
39
- }
40
  return node;
41
  }
42
-
43
  if (Math.random() > 0.995) {
44
- addLog(`[CRITICAL] ${node.id} core voltage failure! Node offline.`);
45
- return { ...node, status: "FAILED", utilization: 0, memory: 0, load: 0, temp: 20 };
46
  }
47
-
48
  let util = node.utilization + (Math.random() - 0.5) * 15;
49
- if (Math.random() > 0.9) {
50
- util += 35;
51
- addLog(`[SPIKE] Massive compute load detected on ${node.id}.`);
52
- }
53
  util = Math.max(0, Math.min(100, util));
54
-
55
- const mem = Math.max(0, Math.min(100, node.memory + (Math.random() - 0.5) * 8));
56
- const load = (util / 100) * 4.2;
57
- const temp = 35 + (util * 0.5) + (Math.random() * 2);
58
-
59
  let status: NodeStatus = "ACTIVE";
60
- if (util > 92) {
61
- status = "OVERLOADED";
62
- if (node.status !== "OVERLOADED") addLog(`[WARNING] ${node.id} thermal throttling active.`);
63
- }
64
  else if (util < 5) status = "IDLE";
65
-
66
- return { ...node, utilization: util, memory: mem, load, status, temp };
67
  })
68
  );
69
  }, 1500);
70
-
71
  return () => clearInterval(interval);
72
  }, []);
73
 
@@ -76,126 +58,85 @@ export default function GPUClusterPanel() {
76
  setAvgLoad(total / nodes.length);
77
  }, [nodes]);
78
 
79
- const addLog = (msg: string) => {
80
- const time = new Date().toLocaleTimeString('en-US', { hour12: false, hour: '2-digit', minute: '2-digit', second: '2-digit' });
81
- setLogs(prev => [`[${time}] ${msg}`, ...prev].slice(0, 50));
82
- };
83
-
84
- if (!mounted) return null;
 
 
85
 
86
  return (
87
- <section className="section-block crazy-gpu" id="gpu-cluster">
88
- <div className="section-label">03 // COMPUTATIONAL SUBSTRATE</div>
89
- <h2 className="section-title">Nvidia H100 Cluster Telemetry</h2>
90
  <p className="section-desc">
91
- High-fidelity hardware monitoring of the underlying neural inference cluster.
92
- Saturation of these nodes directly impacts trust re-calibration latency.
93
  </p>
94
 
95
- <div className="gpu-layout">
96
- {/* LEFT: NODE GRID */}
97
- <div className="gpu-grid-side">
98
- <div className="cluster-grid">
99
- {nodes.map((node) => (
100
- <div key={node.id} className={`card node-card ${node.status.toLowerCase()} crazy-card`}>
101
- <div className="node-glitch-bg" />
102
- <div className="card-id">{node.id} // CORE-AX-{node.id.split("-")[1]}</div>
103
-
104
- <div className="node-status-badge">
105
- <div className="status-dot" style={{
106
- background: node.status === "OVERLOADED" ? "var(--red)" :
107
- node.status === "FAILED" ? "#555" :
108
- node.status === "IDLE" ? "var(--muted)" : "var(--green)"
109
- }} />
110
- {node.status}
111
- </div>
112
-
113
- {/* VISUAL METER */}
114
- <div className="node-visual">
115
- <svg viewBox="0 0 100 100" className="radial-meter">
116
- <circle cx="50" cy="50" r="45" className="meter-bg" />
117
- <motion.circle
118
- cx="50" cy="50" r="45"
119
- className="meter-fill"
120
- initial={{ pathLength: 0 }}
121
- animate={{ pathLength: node.utilization / 100 }}
122
- style={{ stroke: node.utilization > 90 ? "var(--red)" : "var(--cyan)" }}
123
- />
124
- <text x="50" y="55" className="meter-text">{Math.round(node.utilization)}%</text>
125
- </svg>
126
- </div>
127
-
128
- <div className="node-metrics-stack">
129
- <div className="mini-metric">
130
- <span className="l">MEM</span>
131
- <div className="mini-bar-bg"><motion.div className="mini-bar-fill" animate={{ width: `${node.memory}%` }} /></div>
132
- </div>
133
- <div className="mini-metric">
134
- <span className="l">TMP</span>
135
- <div className="mini-bar-bg"><motion.div className="mini-bar-fill tm" animate={{ width: `${(node.temp / 100) * 100}%` }} /></div>
136
- </div>
137
- </div>
138
 
139
- <div className="node-footer-stats">
140
- <div className="node-stat">
141
- <span className="label">LOAD</span>
142
- <span className="val">{node.load.toFixed(1)} TFLOPS</span>
143
- </div>
144
- <div className="node-stat">
145
- <span className="label">FREQ</span>
146
- <span className="val">{node.status === "FAILED" ? 0 : (2.4 + (node.utilization * 0.01)).toFixed(2)} GHz</span>
147
- </div>
148
- </div>
 
149
  </div>
150
- ))}
151
- </div>
152
- </div>
153
-
154
- {/* RIGHT: SYSTEM LOG & HEATMAP */}
155
- <div className="gpu-sys-side">
156
- <div className="card sys-card">
157
- <div className="card-id">SYS-LOG // KERNEL TELEMETRY</div>
158
- <div className="terminal-log" ref={logRef}>
159
- <AnimatePresence initial={false}>
160
- {logs.map((log, i) => (
161
- <motion.div
162
- key={log + i}
163
- initial={{ opacity: 0, x: -10 }}
164
- animate={{ opacity: 1, x: 0 }}
165
- className="log-line"
166
- >
167
- {log}
168
- </motion.div>
169
- ))}
170
- </AnimatePresence>
171
  </div>
172
- </div>
173
 
174
- <div className="card sys-card heatmap-card">
175
- <div className="card-id">THERMAL // HEATMAP</div>
176
- <div className="heatmap-grid">
177
- {Array.from({ length: 64 }).map((_, i) => (
178
- <motion.div
179
- key={i}
180
- className="heat-cell"
181
- animate={{
182
- opacity: 0.2 + (Math.random() * 0.8),
183
- background: i % 8 < 4 ? "var(--cyan)" : "var(--blue)"
184
- }}
185
- transition={{ repeat: Infinity, duration: 1 + Math.random() * 2, repeatType: "mirror" }}
186
  />
187
- ))}
 
 
 
 
 
 
 
 
 
 
 
188
  </div>
189
- <div className="heatmap-overlay">SCANNING...</div>
190
  </div>
191
- </div>
192
  </div>
193
 
194
- <div className="cluster-footer crazy-footer">
195
  <div className="cluster-total-load">
196
- <span className="label">AGGREGATE CLUSTER PRESSURE</span>
197
  <div className="load-meter-bg">
198
- <motion.div
199
  className="load-meter-fill"
200
  animate={{ width: `${avgLoad}%` }}
201
  style={{ background: avgLoad > 80 ? "var(--red)" : "var(--cyan)", color: avgLoad > 80 ? "var(--red)" : "var(--cyan)" } as any}
@@ -205,10 +146,10 @@ export default function GPUClusterPanel() {
205
  </div>
206
  <div className="cluster-telemetry">
207
  <span>THROUGHPUT: <b>{Math.round(140 - (avgLoad * 0.5))} FPS</b></span>
208
- <span>SYSTEM HEALTH: <b style={{ color: avgLoad > 90 ? "var(--red)" : "var(--green)" }}>{avgLoad > 90 ? "CRITICAL" : "OPTIMAL"}</b></span>
 
209
  </div>
210
  </div>
211
-
212
  </section>
213
  );
214
  }
 
11
  memory: number;
12
  load: number;
13
  status: NodeStatus;
 
14
  }
15
 
16
  export default function GPUClusterPanel() {
17
  const [mounted, setMounted] = useState(false);
18
  const [nodes, setNodes] = useState<GPUNode[]>([
19
+ { id: "GPU-1", utilization: 45, memory: 32, load: 1.2, status: "ACTIVE" },
20
+ { id: "GPU-2", utilization: 12, memory: 8, load: 0.4, status: "IDLE" },
21
+ { id: "GPU-3", utilization: 88, memory: 64, load: 2.8, status: "ACTIVE" },
22
+ { id: "GPU-4", utilization: 0, memory: 0, load: 0, status: "IDLE" },
23
  ]);
24
 
25
  const [avgLoad, setAvgLoad] = useState(0);
26
+ const [jitter, setJitter] = useState(0.45);
 
27
 
28
  useEffect(() => {
29
  setMounted(true);
30
  const interval = setInterval(() => {
31
+ setJitter(Math.random() * 2);
32
  setNodes((prev) =>
33
  prev.map((node) => {
34
  if (node.status === "FAILED") {
35
+ if (Math.random() > 0.95) return { ...node, status: "IDLE", utilization: 0, load: 0 };
 
 
 
36
  return node;
37
  }
 
38
  if (Math.random() > 0.995) {
39
+ return { ...node, status: "FAILED", utilization: 0, memory: 0, load: 0 };
 
40
  }
 
41
  let util = node.utilization + (Math.random() - 0.5) * 15;
42
+ if (Math.random() > 0.9) util += 30;
 
 
 
43
  util = Math.max(0, Math.min(100, util));
44
+ const mem = Math.max(0, Math.min(100, node.memory + (Math.random() - 0.5) * 5));
45
+ const load = (util / 100) * 4;
 
 
 
46
  let status: NodeStatus = "ACTIVE";
47
+ if (util > 90) status = "OVERLOADED";
 
 
 
48
  else if (util < 5) status = "IDLE";
49
+ return { ...node, utilization: util, memory: mem, load, status };
 
50
  })
51
  );
52
  }, 1500);
 
53
  return () => clearInterval(interval);
54
  }, []);
55
 
 
58
  setAvgLoad(total / nodes.length);
59
  }, [nodes]);
60
 
61
+ if (!mounted) {
62
+ return (
63
+ <section className="section-block" id="gpu-cluster" style={{ opacity: 0 }}>
64
+ <div className="section-label">03 // COMPUTE RESOURCES</div>
65
+ <h2 className="section-title">GPU Compute Clusters</h2>
66
+ </section>
67
+ );
68
+ }
69
 
70
  return (
71
+ <section className="section-block" id="gpu-cluster">
72
+ <div className="section-label">03 // COMPUTE RESOURCES</div>
73
+ <h2 className="section-title">GPU Compute Clusters</h2>
74
  <p className="section-desc">
75
+ Real-time telemetry from the underlying inference hardware.
76
+ High cluster utilization may introduce latency in the trust calibration loop.
77
  </p>
78
 
79
+ <div className="cluster-grid">
80
+ {nodes.map((node) => (
81
+ <div key={node.id} className={`card node-card ${node.status.toLowerCase()}`}>
82
+ <div className="card-id">{node.id} // NODE-0{node.id.split("-")[1]}</div>
83
+
84
+ <div className="node-status-badge">
85
+ <div className="status-dot" style={{
86
+ background: node.status === "OVERLOADED" ? "var(--red)" :
87
+ node.status === "FAILED" ? "#555" :
88
+ node.status === "IDLE" ? "var(--muted)" : "var(--green)"
89
+ }} />
90
+ {node.status}
91
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ <div className="metric-bar-wrap" style={{ marginTop: 20 }}>
94
+ <div className="metric-bar-label">
95
+ <span>UTILIZATION</span>
96
+ <span style={{ color: "var(--cyan)" }}>{Math.round(node.utilization)}%</span>
97
+ </div>
98
+ <div className="metric-bar-bg">
99
+ <motion.div
100
+ className="metric-bar-fill"
101
+ animate={{ width: `${node.utilization}%` }}
102
+ style={{ background: node.utilization > 90 ? "var(--red)" : "var(--cyan)" } as any}
103
+ />
104
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  </div>
 
106
 
107
+ <div className="metric-bar-wrap" style={{ marginTop: 12 }}>
108
+ <div className="metric-bar-label">
109
+ <span>MEMORY USAGE</span>
110
+ <span style={{ color: "var(--green)" }}>{Math.round(node.memory)}%</span>
111
+ </div>
112
+ <div className="metric-bar-bg">
113
+ <motion.div
114
+ className="metric-bar-fill"
115
+ animate={{ width: `${node.memory}%` }}
116
+ style={{ background: "var(--green)" } as any}
 
 
117
  />
118
+ </div>
119
+ </div>
120
+
121
+ <div className="node-footer-stats">
122
+ <div className="node-stat">
123
+ <span className="label">COMPUTE</span>
124
+ <span className="val">{node.load.toFixed(1)} TFLOPS</span>
125
+ </div>
126
+ <div className="node-stat">
127
+ <span className="label">TEMP</span>
128
+ <span className="val">{Math.round(40 + (node.utilization * 0.4))}°C</span>
129
+ </div>
130
  </div>
 
131
  </div>
132
+ ))}
133
  </div>
134
 
135
+ <div className="cluster-footer">
136
  <div className="cluster-total-load">
137
+ <span className="label">TOTAL CLUSTER LOAD</span>
138
  <div className="load-meter-bg">
139
+ <motion.div
140
  className="load-meter-fill"
141
  animate={{ width: `${avgLoad}%` }}
142
  style={{ background: avgLoad > 80 ? "var(--red)" : "var(--cyan)", color: avgLoad > 80 ? "var(--red)" : "var(--cyan)" } as any}
 
146
  </div>
147
  <div className="cluster-telemetry">
148
  <span>THROUGHPUT: <b>{Math.round(140 - (avgLoad * 0.5))} FPS</b></span>
149
+ <span>LATENCY: <b>{Math.round(12 + (avgLoad * 0.2))}ms</b></span>
150
+ <span>JITTER: <b>{jitter.toFixed(2)}ms</b></span>
151
  </div>
152
  </div>
 
153
  </section>
154
  );
155
  }
ui/app/hooks/useSentinel.ts CHANGED
@@ -9,6 +9,8 @@ import type {
9
 
10
  /* ── helpers ──────────────────────────────────────────── */
11
 
 
 
12
  function bestSpec(obs: Observation | null): string {
13
  if (!obs) return "S0";
14
  return [...obs.available_specialists].sort(
@@ -92,12 +94,12 @@ export function useSentinel() {
92
 
93
  /* load evaluation data once */
94
  useEffect(() => {
95
- fetch(`${process.env.NEXT_PUBLIC_API_URL}/assets/evaluation_results.json`)
96
  .then((r) => r.json())
97
  .then(setEval)
98
  .catch(() => null);
99
 
100
- fetch(`${process.env.NEXT_PUBLIC_API_URL}/assets/trained_policy_replay.jsonl`)
101
  .then((r) => r.ok ? r.text() : "")
102
  .then((txt) => {
103
  const table = new Map<string, ReplayRow>();
 
9
 
10
  /* ── helpers ──────────────────────────────────────────── */
11
 
12
+ const API_BASE = process.env.NEXT_PUBLIC_API_URL || "";
13
+
14
  function bestSpec(obs: Observation | null): string {
15
  if (!obs) return "S0";
16
  return [...obs.available_specialists].sort(
 
94
 
95
  /* load evaluation data once */
96
  useEffect(() => {
97
+ fetch(`${API_BASE}/assets/evaluation_results.json`)
98
  .then((r) => r.json())
99
  .then(setEval)
100
  .catch(() => null);
101
 
102
+ fetch(`${API_BASE}/assets/trained_policy_replay.jsonl`)
103
  .then((r) => r.ok ? r.text() : "")
104
  .then((txt) => {
105
  const table = new Map<string, ReplayRow>();
ui/app/page.tsx CHANGED
@@ -293,7 +293,7 @@ export default function Page() {
293
  </div>
294
  <div className="footer-right">
295
  BUILD 2.4.1 // MARL-FRAMEWORK // MIT LICENSE<br />
296
- © 2025 SENTINEL LAB. ALL RIGHTS RESERVED.
297
  </div>
298
  </footer>
299
  </>
 
293
  </div>
294
  <div className="footer-right">
295
  BUILD 2.4.1 // MARL-FRAMEWORK // MIT LICENSE<br />
296
+ © 2025 THE_BOYS. ALL RIGHTS RESERVED.
297
  </div>
298
  </footer>
299
  </>