"""FastAPI + Gradio server for the GraphStrike OpenEnv environment.""" from __future__ import annotations import json import os import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent.parent)) from fastapi import Body, FastAPI, HTTPException from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles from pydantic import BaseModel from typing import Any, Dict, Optional from models import FakeGangAction, FakeGangObservation, FakeGangState, ActionType from environment import FakeGangEnvironment # --------------------------------------------------------------------------- # App + environment # --------------------------------------------------------------------------- app = FastAPI( title="GraphStrike — OpenEnv", description="RL environment for detecting coordinated fake account rings in social networks.", version="1.0.0", ) app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) # Serve project images at /static/* and /img/* (NOT /assets/ — Gradio uses that path for its own JS/CSS) _PROJECT_ROOT = Path(__file__).parent.parent _ASSETS_DIR = _PROJECT_ROOT / "assets" _IMAGES_DIR = _PROJECT_ROOT / "images" if _ASSETS_DIR.exists(): app.mount("/static", StaticFiles(directory=str(_ASSETS_DIR)), name="static") if _IMAGES_DIR.exists(): app.mount("/img", StaticFiles(directory=str(_IMAGES_DIR)), name="img") _env = FakeGangEnvironment() class ResetRequest(BaseModel): task: str = "easy" seed: Optional[int] = None episode_id: Optional[str] = None class StepResponse(BaseModel): observation: Dict[str, Any] done: bool reward: Optional[float] message: str # --------------------------------------------------------------------------- # OpenEnv API endpoints # --------------------------------------------------------------------------- @app.get("/health") def health(): return {"status": "healthy"} @app.post("/reset", response_model=StepResponse) def reset(req: Optional[ResetRequest] = Body(default=None)): if req is None: req = ResetRequest() obs = _env.reset(task=req.task, seed=req.seed, episode_id=req.episode_id) return StepResponse(observation=obs.model_dump(), done=obs.done, reward=obs.reward, message=obs.message) @app.post("/step", response_model=StepResponse) def step(action: FakeGangAction): obs = _env.step(action) return StepResponse(observation=obs.model_dump(), done=obs.done, reward=obs.reward, message=obs.message) @app.get("/state") def state(): return _env.state.model_dump() @app.get("/tasks") def list_tasks(): _formula = ( "if recall >= win_recall and precision >= win_precision: " "score = 0.55 + 0.20*recall + 0.15*precision + 0.10*efficiency " "else: score = 0.30*recall + 0.10*precision" ) return { "tasks": [ { "name": "easy", "description": "50 accounts, 10 fakes, no evasion, 30 steps", "max_steps": 30, "grader": { "endpoint": "/grader", "score_range": [0.0, 1.0], "win_threshold": 0.815, "win_conditions": {"recall": 0.8, "precision": 0.7}, "formula": _formula, }, }, { "name": "medium", "description": "200 accounts, 10 fakes + 20 decoys, evasion at step 20, 50 steps", "max_steps": 50, "grader": { "endpoint": "/grader", "score_range": [0.0, 1.0], "win_threshold": 0.815, "win_conditions": {"recall": 0.8, "precision": 0.7}, "formula": _formula, }, }, { "name": "hard", "description": "1000 accounts, 10 fakes + 50 decoys, recurring evasion, 80 steps", "max_steps": 80, "grader": { "endpoint": "/grader", "score_range": [0.0, 1.0], "win_threshold": 0.868, "win_conditions": {"recall": 0.9, "precision": 0.8}, "formula": _formula, }, }, ], "action_schema": { "action_type": ["inspect", "investigate_network", "flag", "unflag", "submit"], "account_id": "string (required for all actions except submit)", }, "score_range": [0.0, 1.0], } @app.get("/grader") def grader(): if not _env._done: raise HTTPException(status_code=400, detail="Episode not complete. Call SUBMIT first.") return {"score": _env._last_grader_score, "task": _env._task, "episode_id": _env._episode_id} @app.get("/metadata") def metadata(): return { "name": "graphstrike", "version": "1.0.0", "author": "Pandago", "description": "RL environment for detecting coordinated fake account rings in social networks.", "tags": ["social-network", "fraud-detection", "graph", "rl"], } @app.get("/schema") def schema(): return { "action": FakeGangAction.model_json_schema(), "observation": FakeGangObservation.model_json_schema(), "state": FakeGangState.model_json_schema(), } @app.post("/mcp") def mcp(body: Dict[str, Any] = {}): method = body.get("method", "") req_id = body.get("id", 1) if method == "tools/list": return {"jsonrpc": "2.0", "id": req_id, "result": {"tools": [ {"name": "reset", "description": "Reset the environment", "inputSchema": {"type": "object", "properties": {"task": {"type": "string"}, "seed": {"type": "integer"}}}}, {"name": "step", "description": "Take an action", "inputSchema": FakeGangAction.model_json_schema()}, {"name": "state", "description": "Get episode state", "inputSchema": {"type": "object", "properties": {}}}, ]}} return {"jsonrpc": "2.0", "id": req_id, "result": {"name": "graphstrike", "version": "1.0.0", "protocolVersion": "2024-11-05"}} @app.api_route("/baseline", methods=["GET", "POST"]) def baseline(): sys.path.insert(0, str(Path(__file__).parent.parent)) from inference import run_rule_based_episode scores = {} for task in ["easy", "medium", "hard"]: scores[task] = run_rule_based_episode(_env, task=task, seed=0) return {"scores": scores, "agent": "rule_based"} # HF Spaces probes /web — redirect to root (must be on FastAPI before Gradio mount) @app.get("/web", response_class=RedirectResponse) def web_redirect(): return RedirectResponse(url="/") # --------------------------------------------------------------------------- # Gradio UI # --------------------------------------------------------------------------- import pandas as pd # ── Benchmark data ─────────────────────────────────────────────────────────── BENCH_SEED0 = [ # [Model, Params, Easy, Medium, Hard, Mean] — sorted by Mean desc ["Llama 4 Scout 17B", "17B", 0.960, 0.979, 0.976, 0.972], ["Ministral 3 8B", "8B", 0.967, 0.964, 0.964, 0.965], ["DeepSeek V3.2", "685B", 0.967, 0.960, 0.933, 0.953], ["Nemotron Super 3", "49B", 0.930, 0.941, 0.964, 0.945], ["Rule-Based Baseline","—", 0.910, 0.906, 0.904, 0.907], ["Gemma 3 12B", "12B", 0.900, 0.908, 0.908, 0.905], ] BENCH_VARIANCE = [ # [Model, Easy mean, Easy var, Med mean, Med var, Hard mean, Hard var] ["Llama 4 Scout 17B", 0.960, 0.000007, 0.979, 0.000001, 0.976, 0.000063], ["Nemotron Super 3", 0.957, 0.000, 0.957, 0.000, 0.645, 0.208], ["Ministral 3 8B", 0.958, 0.000, 0.645, 0.208, 0.623, 0.195], ["DeepSeek V3.2", 0.640, 0.205, 0.957, 0.000, 0.645, 0.208], ["Gemma 3 12B", 0.912, 0.000, 0.917, 0.000, 0.603, 0.182], ] PROFILE_HEADERS = ["Account", "Status", "Risk", "Node", "Beh", "Graph", "Hub", "Photo", "Bio", "IP", "F.Nbrs"] # Long-format DataFrame for BarPlot _bench_long_rows = [] for _r in BENCH_SEED0: _bench_long_rows += [ {"Model": _r[0], "Task": "Easy", "Score": _r[2]}, {"Model": _r[0], "Task": "Medium", "Score": _r[3]}, {"Model": _r[0], "Task": "Hard", "Score": _r[4]}, ] BENCH_LONG_DF = pd.DataFrame(_bench_long_rows) # ── HTML table builders ────────────────────────────────────────────────────── def _score_color(s: float) -> str: if s >= 0.960: return "#22c55e" if s >= 0.930: return "#86efac" if s >= 0.910: return "#facc15" return "#f97316" def _var_color(v: float) -> str: if v < 0.001: return "#22c55e" if v < 0.05: return "#facc15" return "#f87171" _TH = "padding:11px 16px;font-weight:600;white-space:nowrap;" _TD = "padding:10px 16px;white-space:nowrap;" _TABLE_WRAP = ( "overflow-x:auto;border-radius:10px;border:1px solid #1e3a5f;" "font-family:'IBM Plex Mono',monospace;font-size:13.5px;" ) _THEAD_BG = "background:#0c2340;" def _leaderboard_html() -> str: header = ( f"" f"#" f"Model" f"Params" f"Easy" f"Medium" f"Hard" f"Mean" f"" ) rows = "" for i, r in enumerate(BENCH_SEED0): bg = "#162032" if i % 2 == 0 else "#0f172a" is_base = r[0] == "Rule-Based Baseline" name_cell = ( f"{r[0]} (baseline)" if is_base else r[0] ) name_color = "#94a3b8" if is_base else "#e2e8f0" rows += ( f"" f"{i+1}" f"{name_cell}" f"{r[1]}" + "".join( f"{r[j]:.3f}" for j in (2, 3, 4) ) + f"{r[5]:.3f}" f"" ) return f"
{header}{rows}
" def _variance_html() -> str: header = ( f"" f"Model" f"Easy — mean / var" f"Medium — mean / var" f"Hard — mean / var" f"" ) rows = "" for i, r in enumerate(BENCH_VARIANCE): bg = "#162032" if i % 2 == 0 else "#0f172a" def cell(mean, var): return ( f"" f"{mean:.3f}" f" / {var:.1e}" f"" ) rows += ( f"" f"{r[0]}" + cell(r[1], r[2]) + cell(r[3], r[4]) + cell(r[5], r[6]) + "" ) return f"
{header}{rows}
" def _baseline_html() -> str: rows_data = [ ("Easy", 0.9100, "100%", "#4ade80"), ("Medium", 0.9060, "84%", "#facc15"), ("Hard", 0.9038, "52%", "#f87171"), ] header = ( f"" f"Task" f"Score (seed=0)" f"Win Rate (50 seeds)" f"" ) rows = "" for i, (task, score, wr, col) in enumerate(rows_data): bg = "#162032" if i % 2 == 0 else "#0f172a" rows += ( f"" f"{task}" f"{score:.4f}" f"{wr}" f"" ) return f"
{header}{rows}
" try: import gradio as gr # ── Observation / profile helpers ───────────────────────────────────────── def _fmt_obs(d: dict) -> str: lines = [] task = d.get('task', '?').upper() done = d.get('done', False) steps = d.get('steps_remaining', '?') state_label = "Done" if done else "In Progress" lines.append(f"### Task: **{task}** | Steps remaining: **{steps}** | {state_label}") if d.get('reward') is not None: lines.append(f"**Final Reward:** `{d['reward']:.2f}`") fl = d.get('flagged_ids', []) lines.append(f"**Flagged ({len(fl)}/10):** " + (" ".join(f"`{f}`" for f in fl) if fl else "*none*")) su = d.get('suspect_ids', []) ins = set(d.get('inspected_ids', [])) uninspected_sus = [s for s in su if s not in ins] if uninspected_sus: lines.append(f"**Suspects — uninspected ({len(uninspected_sus)}):** " + " ".join(f"`{s}`" for s in uninspected_sus)) lines.append(f"**Visible:** {len(d.get('visible_account_ids',[]))} IDs | **Inspected:** {len(d.get('inspected_ids',[]))} accounts") if d.get('evasion_triggered'): lines.append(f"**Evasion events fired:** {d.get('evasion_count', 0)}") lines.append(f"\n> {d.get('message', '')}") return "\n\n".join(lines) def _profile_rows(d: dict) -> list: accs = d.get("visible_accounts", []) if not accs: return [] STATUS_MAP = { "confirmed_fake": "confirmed_fake [flagged]", "suspect": "suspect", "normal": "normal", } rows = [] for a in sorted(accs, key=lambda x: x.get("fake_risk_score", 0), reverse=True)[:40]: rows.append([ a.get("account_id", ""), STATUS_MAP.get(a.get("status", ""), a.get("status", "")), round(a.get("fake_risk_score", 0), 3), round(a.get("node_risk", 0), 3), round(a.get("behavior_risk", 0), 3), round(a.get("graph_risk", 0), 3), round(a.get("hub_legitimacy_score", 0), 3), round(a.get("photo_reuse_score", 0), 3), round(a.get("bio_template_score", 0), 3), a.get("shared_ip_count", 0), a.get("flagged_neighbor_count", 0), ]) return rows def _fmt_visible_ids(d: dict) -> str: ins = set(d.get('inspected_ids', [])) suspects = set(d.get('suspect_ids', [])) flagged = set(d.get('flagged_ids', [])) visible = d.get('visible_account_ids', []) if not visible: return "*No visible accounts yet.*" parts = [] for vid in visible: if vid in flagged: parts.append(f"**[F]** `{vid}`") elif vid in suspects and vid not in ins: parts.append(f"**[S]** `{vid}`") elif vid in ins: parts.append(f"`{vid}`") else: parts.append(f"`{vid}`") return " ".join(parts) # ── Playground callbacks ────────────────────────────────────────────────── def gr_reset(task, seed): try: obs = _env.reset(task=task, seed=int(seed)) d = obs.model_dump() return _fmt_obs(d), _profile_rows(d), _fmt_visible_ids(d), json.dumps(d, indent=2, default=str) except Exception as e: return f"**Error:** {e}", [], "", "{}" def gr_step(action_type, account_id): try: acc = account_id.strip() if action_type != "submit" else None action = FakeGangAction(action_type=ActionType(action_type), account_id=acc) obs = _env.step(action) d = obs.model_dump() return _fmt_obs(d), _profile_rows(d), _fmt_visible_ids(d), json.dumps(d, indent=2, default=str) except Exception as e: return f"**Error:** {e}", [], "", "{}" def gr_grader(): if not _env._done: return "Episode not complete — call SUBMIT first." return ( f"**Score:** `{_env._last_grader_score:.4f}` | " f"**Task:** {_env._task} | " f"**Episode:** `{_env._episode_id}`" ) def gr_baseline(): sys.path.insert(0, str(Path(__file__).parent.parent)) from inference import run_rule_based_episode scores = {t: run_rule_based_episode(_env, task=t, seed=0) for t in ["easy", "medium", "hard"]} mean = sum(scores.values()) / 3 return ( f"**Baseline (rule-based, seed=0)**\n\n" f"Easy: `{scores['easy']:.4f}` | Medium: `{scores['medium']:.4f}` | " f"Hard: `{scores['hard']:.4f}` | Mean: `{mean:.4f}`" ) # ── Build Gradio UI ─────────────────────────────────────────────────────── # ── README content (rendered as styled HTML) ───────────────────────────── _README_HTML = """

What is GraphStrike?

An OpenEnv-compatible reinforcement learning environment where an LLM agent must identify all 10 members of a coordinated fake account ring hidden inside a synthetic social network. The agent learns via Reflexion and a dynamic hybrid rule/LLM policy — no gradient updates, no fine-tuning required.

Submitted to the OpenEnv Hackathon × SCALER School of Technology. Judges deploy this container, run their own LLM agent against it, and score on task quality, environment design, code quality, creativity, and domain quality.

10Gang members to find per episode
3Difficulty tiers (easy / medium / hard)
150Pre-generated episodes (50 per task)
24Automated validator checks

System Architecture

System Architecture

End-to-end pipeline: episode generation → environment server → hybrid agent → reflexion memory

Task Difficulty Tiers

TaskNetwork SizeGangDecoysMax StepsWin ConditionBaseline Score
Easy50 accounts10030Recall ≥ 0.8, Precision ≥ 0.70.910
Medium200 accounts102050Recall ≥ 0.8, Precision ≥ 0.70.906
Hard1000 accounts105080Recall ≥ 0.9, Precision ≥ 0.80.904

Hard mode fires 4 evasion events (steps 15, 30, 45, 60) that drop intra-gang follow edges mid-investigation, destroying graph signals.


Detection Signal Hierarchy

Signal Hierarchy

Node signals (offline) → Behavioral signals (temporal/device) → Graph signals (live at INSPECT) → False-positive control via hub legitimacy

Node Signals (pre-computed offline)

FeatureFake RangeReal RangeWhat it measures
photo_reuse_score0.30 – 0.950.00 – 0.15Stolen celebrity photos via pHash fingerprint matching
bio_template_score0.20 – 0.900.00 – 0.12Cosine similarity to known fake bio templates
comment_repeat_score0.60 – 0.900.00 – 0.08Fraction of copy-pasted spam comments across accounts

Behavioral Signals (temporal + device)

FeatureFake Pattern
avg_post_hourAll 10 gang members post within ±0.5h of each other (coordinated scheduling)
account_age_daysCreated same week — base_age ± 7 days
shared_ip_count= 9 for all gang members (one IP subnet per episode, unique seed)

Graph Signals (computed live at INSPECT)

FeatureFake Pattern
mutual_follow_rate0.6 – 0.9 (dense intra-gang mutual follows)
flagged_neighbor_countGrows as investigation proceeds — strongest late-game signal
avg_neighbor_photo_reuseHigh when cluster shares stolen content

Episode Lifecycle & Action Mechanics

Episode Flow

Episode flow: reset → inspect/flag/investigate loop → dual SUSPECT cascade → submit → grader score

Action Space

ActionStep CostEffect
INSPECT acc_XXXX1 stepReveals full AccountProfile + follow list; adds 1-hop neighbors to visible set
INVESTIGATE_NETWORK acc_XXXX2 stepsBidirectional 2-hop expansion (outgoing + incoming edges); re-cascades SUSPECT
FLAG acc_XXXXFREEMarks as fake; triggers dual SUSPECT cascade (follow-graph + IP cluster)
UNFLAG acc_XXXXFREERemoves flag; clears CONFIRMED_FAKE status
SUBMITFREEEnds episode; triggers grader scoring

Dual SUSPECT Cascade (triggered by FLAG)

Cascade 1 — Follow-Graph

Every account the flagged member follows (_live_edges) becomes SUSPECT if visible and NORMAL. Gang follow density is 0.70+ so this is high-precision.

Cascade 2 — IP Cluster

Every visible account sharing the same ip_cluster_id becomes SUSPECT. Gang shares ip_gang_<seed>; real accounts have unique IPs. Zero false positives.


Risk Scoring Mathematics

Risk Scoring Overview

All scoring functions are stateless and deterministic — called inside _build_profile() at every INSPECT

Risk Formulas Part 1

Node risk, Behavior risk, Graph risk components

Risk Formulas Part 2

Hub legitimacy, Composite fake_risk_score formula

fake_risk = clip( 0.30 × node_risk ← content signals (photo reuse, bio templates) + 0.25 × behavior_risk ← temporal + age clustering + 0.45 × graph_risk ← structural coordination (highest weight — hardest to fake) − 0.25 × hub_legitimacy, ← subtractive: celebrities score ≈ 0 before clip 0.0, 1.0)

Grader Score Formula

recall = tp / 10 precision = tp / max(tp + fp, 1) efficiency = max(0, (max_steps − steps_used) / max_steps) if recall ≥ 0.8 and precision ≥ 0.7: score = 0.55 + 0.20×recall + 0.15×precision + 0.10×efficiency else: score = 0.30×recall + 0.10×precision # Maximum possible: 1.00 | Win threshold: ~0.815

Reflexion Learning

Reflexion Learning Loop

Post-episode lessons injected into every future prompt — learning without weight updates

The LLM (Qwen3-80B via AWS Bedrock) cannot be fine-tuned — it is a black-box API. Instead, a separate Qwen3 call generates a 2–3 sentence lesson after each episode. The best winning trajectory is stored as a few-shot example injected into all future prompts.

Episode N:
  LLM acts using: system_prompt + reflections[last 4] + best_trajectory
  Episode ends → WIN or LOSS
  LOSS → generate_reflection(action_log, outcome) → lesson stored
  WIN  → save trajectory if better reward + generate_success_reflection

Episode N+1:
  last 4 reflections + best win trajectory injected into prompt
  → LLM has learned from its past without any weight updates

Hybrid Policy — The Novel Contribution

Hybrid Policy Architecture

Dynamic alpha-weighted blend: rules dominate early, LLM earns trust through wins and reflections

A dynamic α-weighted blend of a deterministic rule engine and the LLM. α represents trust in the LLM — starts at 0.20 (rules dominate), climbs as the LLM wins consistently and accumulates reflections, capped per task to prevent the LLM from overriding correct high-confidence rule decisions.

reflection_factor = min(1.0, n_reflections / 4.0) raw = 0.20 + reflection_factor × (0.80 × recent_win_rate + 0.12) alpha = clamp(raw, 0.20, task_cap) Per-task caps: easy → 0.50 | medium → 0.70 | hard → 0.85
Alpha progression over training

Alpha progression: rule-dominated early training → LLM earns authority through wins

Rule Confidence Levels

SituationRule ActionConfidence
Steps remaining = 0SUBMIT1.00
Uninspected SUSPECT accounts existINSPECT suspects[0]0.95
fake_risk ≥ 0.85FLAG that account0.95
fake_risk in [threshold, 0.85)FLAG that account0.70 – 0.94
10 flags placedSUBMIT0.85
Steps remaining ≤ 3SUBMIT0.90
Uninspected accounts availableINSPECT top candidate0.30

When rule_confidence ≥ alpha the rule engine overrides. At easy cap (0.50), the LLM controls only exploratory INSPECT decisions. At hard cap (0.85), the LLM controls most decisions except forced submits and suspect cascade.

""" _HEADER_HTML = """

GraphStrike

COORDINATED FAKE ACCOUNT RING DETECTION — OPENENV RL ENVIRONMENT

OpenEnv Hackathon Reinforcement Learning Hybrid Policy Reflexion Learning Fraud Detection
""" _FOOTER_HTML = """
GraphStrike — OpenEnv Hackathon × SCALER School of Technology  |  API Docs
""" with gr.Blocks(title="GraphStrike") as demo: gr.HTML(_HEADER_HTML) with gr.Tabs(): # ══════════════ TAB 1: README ══════════════ with gr.Tab("Overview"): gr.HTML(_README_HTML) # ══════════════ TAB 2: PLAYGROUND ══════════════ with gr.Tab("Playground"): with gr.Row(): with gr.Column(scale=1, min_width=220): gr.Markdown("**1 — Episode**") task_dd = gr.Dropdown(["easy","medium","hard"], value="easy", label="Task") seed_in = gr.Number(value=0, label="Seed", precision=0) reset_btn = gr.Button("Reset", variant="primary") with gr.Column(scale=1, min_width=220): gr.Markdown("**2 — Action**") action_dd = gr.Dropdown( ["inspect","investigate_network","flag","unflag","submit"], value="inspect", label="Action") acc_in = gr.Textbox(label="Account ID", placeholder="acc_0012") step_btn = gr.Button("Step", variant="primary") with gr.Column(scale=1, min_width=180): gr.Markdown("**3 — Score**") gr.Markdown("
", container=False) grader_btn = gr.Button("Grader Score", size="sm") baseline_btn = gr.Button("Baseline Agent", size="sm") gr.Button("API Docs (Swagger)", size="sm", link="/docs", link_target="_blank") obs_md = gr.Markdown(value="*Reset an episode to begin.*") gr.Markdown("**Account Profiles** — sorted by fake risk score (highest first)") prof_table = gr.Dataframe( headers=PROFILE_HEADERS, datatype=["str","str","number","number","number","number", "number","number","number","number","number"], value=[], interactive=False, wrap=False, column_widths=["110px","160px","70px","70px","70px", "70px","70px","70px","70px","55px","70px"], ) result_md = gr.Markdown(value="") with gr.Accordion("All Visible IDs", open=False): vis_md = gr.Markdown(value="") with gr.Accordion("Raw JSON", open=False): raw_json = gr.Textbox(lines=20, interactive=False) reset_btn.click(gr_reset, [task_dd, seed_in], [obs_md, prof_table, vis_md, raw_json]) step_btn.click( gr_step, [action_dd, acc_in], [obs_md, prof_table, vis_md, raw_json]) grader_btn.click(gr_grader, [], result_md) baseline_btn.click(gr_baseline,[], result_md) # ══════════════ TAB 2: BENCHMARKS ══════════════ with gr.Tab("Benchmarks"): gr.Markdown( "### LLM Agent Evaluation — GraphStrike Environment\n" "Agents evaluated with identical system prompts and structured inference. " "Grader score range: **0.0 – 1.0** (win threshold ≥ 0.815). " "Score colours: " " ≥0.960   " " ≥0.930   " " ≥0.910   " " below", sanitize_html=False, ) gr.Markdown("#### Leaderboard — Single Seed (seed=0)") gr.HTML(_leaderboard_html()) gr.Markdown("#### Score Distribution by Task") gr.BarPlot( value=BENCH_LONG_DF, x="Model", y="Score", color="Task", title="Agent Scores by Task (seed=0)", color_map={"Easy": "#4ade80", "Medium": "#facc15", "Hard": "#f87171"}, y_lim=[0.50, 1.0], x_label_angle=-25, height=340, ) gr.Markdown( "#### Stability — 3-Seed Variance Check (seeds 0, 1, 2)\n" "Variance colour: " " stable (<0.001)   " " moderate   " " high", sanitize_html=False, ) gr.HTML(_variance_html()) gr.Markdown("#### Rule-Based Baseline (no LLM, deterministic)") gr.HTML(_baseline_html()) gr.Markdown( "#### Key Observations\n" "- Hard task is the real differentiator — evasion events destroy graph signals " "mid-investigation, requiring adaptive reasoning beyond memorised patterns.\n" "- Llama 4 Scout 17B achieves the lowest variance on hard (6e-5), " "outperforming models with 40× more parameters.\n" "- The rule-based baseline is competitive at mean 0.907, confirming " "the environment's signal quality. LLM value is in evasion adaptation.\n" "- All frontier models exceed 0.93 on easy/medium — cascade mechanics " "are learnable from the structured observation format." ) gr.HTML(_FOOTER_HTML) app = gr.mount_gradio_app(app, demo, path="/") print("[GraphStrike] Gradio UI mounted at /", flush=True) except Exception as exc: import traceback print(f"[GraphStrike] Gradio unavailable: {exc}", flush=True) traceback.print_exc() @app.get("/", response_class=HTMLResponse) def root_fallback(): return "

GraphStrike

API mode. Swagger

" # --------------------------------------------------------------------------- # Entry point # --------------------------------------------------------------------------- def main(): import uvicorn port = int(os.environ.get("PORT", 7860)) print(f"[GraphStrike] Starting on port {port}", flush=True) uvicorn.run("server.app:app", host="0.0.0.0", port=port, log_level="info", workers=1) if __name__ == "__main__": main()