# Captured from a Hugging Face Space file view; the Space status shown at capture time was "Runtime error".
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| import importlib.util | |
| import re | |
| import gradio as gr | |
# Ensure Milestone 5 evaluation utilities are importable when running inside the Space.
# The repository root is expected three directories above this file. When the file
# lives in a shallower tree, ``parents[3]`` would raise IndexError at import time,
# so fall back to the filesystem anchor instead of crashing.
_APP_FILE = Path(__file__).resolve()
_APP_ANCESTORS = _APP_FILE.parents
REPO_ROOT = _APP_ANCESTORS[3] if len(_APP_ANCESTORS) > 3 else Path(_APP_FILE.anchor)
EVAL_DIR = REPO_ROOT / "Milestone-5" / "router-agent"
if EVAL_DIR.exists():
    # Prepend so modules in EVAL_DIR are found before same-named installed packages.
    sys.path.insert(0, str(EVAL_DIR))
# Optional dependency: schema scoring helpers. When the import fails for any
# reason the helper names are bound to None and the reason is kept for the UI.
SCHEMA_IMPORT_ERROR = ""
try:
    from schema_score import (  # type: ignore
        run_schema_evaluation,
        tool_sequence,
        todo_covers_all_tools,
        todo_tool_alignment,
    )
except Exception as exc:  # pragma: no cover - handled gracefully in UI.
    run_schema_evaluation = tool_sequence = None
    todo_covers_all_tools = todo_tool_alignment = None
    SCHEMA_IMPORT_ERROR = str(exc)

# Optional dependency: threshold loading/evaluation helpers.
THRESHOLD_IMPORT_ERROR = ""
try:
    from router_benchmark_runner import (  # type: ignore
        load_thresholds,
        evaluate_thresholds,
    )
except Exception as exc:  # pragma: no cover
    load_thresholds = evaluate_thresholds = None
    THRESHOLD_IMPORT_ERROR = str(exc)

# Optional dependency: Hugging Face inference client.
try:
    from huggingface_hub import InferenceClient
except Exception:  # pragma: no cover
    InferenceClient = None  # type: ignore
# Runtime configuration is read from environment variables.
HF_ROUTER_REPO = os.environ.get("HF_ROUTER_REPO", "")
HF_TOKEN = os.environ.get("HF_TOKEN")
BENCH_GOLD_PATH = EVAL_DIR / "benchmarks" / "router_benchmark_hard.jsonl"
THRESHOLDS_PATH = EVAL_DIR / "router_benchmark_thresholds.json"

# Start from the "not configured" state and only replace it when a client
# can actually be constructed.
client = None
ROUTER_LOAD_ERROR = "InferenceClient unavailable or HF_ROUTER_REPO unset."
if HF_ROUTER_REPO and InferenceClient is not None:
    try:
        client = InferenceClient(model=HF_ROUTER_REPO, token=HF_TOKEN)
        ROUTER_LOAD_ERROR = ""
    except Exception as exc:  # pragma: no cover
        client = None
        ROUTER_LOAD_ERROR = str(exc)
# Instruction prefix prepended to every router request (three newline-separated lines).
SYSTEM_PROMPT = "\n".join(
    (
        "You are the Router Agent coordinating Math, Code, and General-Search specialists.",
        "Emit ONLY strict JSON with keys route_plan, route_rationale, expected_artifacts,",
        "thinking_outline, handoff_plan, todo_list, difficulty, tags, acceptance_criteria, metrics.",
    )
)
# Human-readable record of module/agent loading events; rendered in the UI.
AGENT_LOAD_LOG: List[str] = []


def _load_module(module_name: str, file_path: Path):
    """Import a Python source file by path and return the resulting module.

    Args:
        module_name: Name under which the module is created and registered.
        file_path: Location of the ``.py`` file to execute.

    Returns:
        The loaded module, or ``None`` when the file is missing, no import
        spec/loader can be built, or executing the module raises. Every
        failure is appended to ``AGENT_LOAD_LOG`` instead of propagating.
    """
    if not file_path.exists():
        AGENT_LOAD_LOG.append(f"Missing module: {file_path}")
        return None

    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        AGENT_LOAD_LOG.append(f"Unable to load spec for {file_path}")
        return None

    module = importlib.util.module_from_spec(spec)
    # Register before executing: code that looks itself up through
    # ``sys.modules[cls.__module__]`` during import (``@dataclass`` with string
    # annotations does) fails when the module is not registered yet.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)  # type: ignore[attr-defined]
    except Exception as exc:
        # Roll back so a half-initialised module is never left importable.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        AGENT_LOAD_LOG.append(f"Failed to import {file_path.name}: {exc}")
        return None
    return module
# Locate and load the shared agent interface definitions.
M6_ROOT = REPO_ROOT / "Milestone-6"
AGENT_BASE_PATH = M6_ROOT / "agents" / "base.py"
BASE_MODULE = _load_module("router_agents_base", AGENT_BASE_PATH)

# Both names stay None unless the base module loaded and defines them.
AgentRequest = None
AgentResult = None
if BASE_MODULE:
    AgentRequest = getattr(BASE_MODULE, "AgentRequest", None)
    AgentResult = getattr(BASE_MODULE, "AgentResult", None)
else:
    AGENT_LOAD_LOG.append("Agent base definitions unavailable; agent execution disabled.")
class GeminiFallbackManager:
    """Optional Gemini-backed generator used when a routed agent cannot answer.

    Construction does not raise: a missing package, missing API key or model
    initialisation failure is stored in ``error`` and appended to
    ``AGENT_LOAD_LOG``, and ``available`` stays ``False``.
    """

    def __init__(self) -> None:
        self.available = False
        self.error: Optional[str] = None
        self.model = None
        self.model_name = os.environ.get("GEMINI_MODEL", "gemini-2.5-pro-exp-0801")
        api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")

        try:
            import google.generativeai as genai  # type: ignore
        except Exception as exc:  # pragma: no cover
            self._disable(f"google-generativeai import failed: {exc}")
            return

        if not api_key:
            self._disable("GOOGLE_API_KEY (or GEMINI_API_KEY) not set.")
            return

        try:
            genai.configure(api_key=api_key)
            self.model = genai.GenerativeModel(self.model_name)
        except Exception as exc:  # pragma: no cover
            self._disable(f"Failed to initialise Gemini model: {exc}")
            return

        self.available = True
        AGENT_LOAD_LOG.append(f"Gemini fallback ready (model={self.model_name}).")

    def _disable(self, reason: str) -> None:
        """Record why the fallback is unusable, on the instance and in the load log."""
        self.error = reason
        AGENT_LOAD_LOG.append(f"Gemini fallback disabled: {self.error}")

    @staticmethod
    def _build_prompt(
        tool_name: str,
        original_query: Any,
        step_instruction: Any,
        error: Optional[str],
    ) -> str:
        """Assemble the instruction prompt sent to the fallback model."""
        sections = [
            f"You are the fallback specialist for router tool `{tool_name}`.\n",
            "Provide a thoughtful, self-contained response even when primary agents fail.\n",
            "Instructions:\n",
            "- Derive or explain any mathematics rigorously with step-by-step reasoning.\n",
            "- When code is required, output Python snippets and describe expected outputs; ",
            "assume execution in a safe environment but do not fabricate results without caveats.\n",
            "- When internet search is needed, hypothesise likely high-quality sources and cite them ",
            "as inline references (e.g., [search:keyword] or known publications).\n",
            "- Make assumptions explicit, and flag any gaps that require real execution or live search.\n",
            "- Return the final answer in Markdown.\n",
            f"\nOriginal user query:\n{original_query or 'N/A'}\n",
            f"\nCurrent routed instruction:\n{step_instruction}\n",
        ]
        if error:
            sections.append(f"\nPrevious agent error: {error}\n")
        return "".join(sections)

    def generate(self, tool_name: str, request: Any, error: Optional[str] = None) -> Any:
        """Produce an ``AgentResult`` for ``tool_name`` using the fallback model.

        Args:
            tool_name: Router tool the response stands in for.
            request: Either a dict or an object exposing ``context`` and
                ``user_query``.
            error: Optional description of the upstream failure; it is added
                to the prompt and to the returned metrics.

        Raises:
            RuntimeError: If the fallback is unavailable or generation fails.
        """
        if not (self.available and self.model is not None and AgentResult is not None):
            raise RuntimeError("Gemini fallback not available.")

        # Uniform field reader for both supported request shapes.
        if isinstance(request, dict):
            read = request.get
        else:
            def read(key: str, default: Any = None) -> Any:
                return getattr(request, key, default)

        context = read("context") or {}
        step_instruction = read("user_query", "")
        prompt = self._build_prompt(
            tool_name,
            context.get("original_query", ""),
            step_instruction,
            error,
        )

        try:
            response = self.model.generate_content(
                prompt,
                generation_config={"temperature": 0.2, "top_p": 0.8},
            )
            text = getattr(response, "text", None)
            if text is None and hasattr(response, "candidates"):
                text = response.candidates[0].content.parts[0].text  # type: ignore
        except Exception as exc:  # pragma: no cover
            raise RuntimeError(f"Gemini fallback generation failed: {exc}") from exc

        metrics = {"status": "fallback", "model": self.model_name}
        if error:
            metrics["upstream_error"] = error
        return AgentResult(
            content=text or "Fallback model did not return content.",
            metrics=metrics,
        )
# Shared fallback instance used by the stub agents and by plan execution.
fallback_manager: GeminiFallbackManager = GeminiFallbackManager()
def _load_agent_class(
    agent_name: str,
    primary_path: Path,
    primary_class: str,
    fallback_path: Optional[Path] = None,
    fallback_class: Optional[str] = None,
):
    """Return an agent class from its primary file, else from the fallback file.

    The fallback file is only loaded when the primary one does not yield
    ``primary_class``. Each outcome is appended to ``AGENT_LOAD_LOG``.

    Returns:
        The class object, or ``None`` when neither source provides one.
    """
    sources = [("primary", primary_path, primary_class)]
    if fallback_path and fallback_class:
        sources.append(("fallback", fallback_path, fallback_class))

    for role, source_path, class_name in sources:
        loaded = _load_module(f"{agent_name}_{role}", source_path)
        if not (loaded and hasattr(loaded, class_name)):
            continue
        if role == "primary":
            AGENT_LOAD_LOG.append(f"Loaded {class_name} from {source_path}")
        else:
            AGENT_LOAD_LOG.append(f"Using fallback {class_name} for {agent_name}")
        return getattr(loaded, class_name)

    AGENT_LOAD_LOG.append(f"No implementation available for {agent_name}")
    return None
# Maps tool names (with and without the leading slash) to agent objects.
AGENT_REGISTRY: Dict[str, Any] = {}


def _register_agent(name: str, agent_obj: Any) -> None:
    """Register ``agent_obj`` under ``name`` and under its slash/no-slash twin."""
    alias = name.lstrip("/") if name.startswith("/") else f"/{name}"
    AGENT_REGISTRY[name] = agent_obj
    AGENT_REGISTRY[alias] = agent_obj
if AgentRequest is not None and AgentResult is not None:
    # Resolve all implementation classes first so the load log keeps its order.
    math_class = _load_agent_class(
        "math_agent",
        M6_ROOT / "math-agent" / "handler.py",
        "MathAgent",
        fallback_path=M6_ROOT / "math-agent" / "math_agent_template.py",
        fallback_class="TemplateMathAgent",
    )
    code_class = _load_agent_class(
        "code_agent",
        M6_ROOT / "code-agent" / "handler.py",
        "CodeAgent",
    )
    general_class = _load_agent_class(
        "general_agent",
        M6_ROOT / "general-agent" / "handler.py",
        "GeneralSearchAgent",
    )

    class _StubAgent:
        """Placeholder agent: tries the Gemini fallback, else returns a fixed message."""

        def __init__(self, tool_name: str, message: str):
            self.name = tool_name
            self._message = message

        def invoke(self, request: Any) -> Any:
            if fallback_manager.available:
                try:
                    return fallback_manager.generate(self.name, request)
                except Exception as exc:  # pragma: no cover
                    AGENT_LOAD_LOG.append(f"Gemini fallback failed for {self.name}: {exc}")
            stub_metrics = {"status": "stub", "tool": self.name}
            return AgentResult(content=self._message, metrics=stub_metrics)

    def _instantiate_or_stub(
        agent_class: Any,
        tool_name: str,
        class_label: str,
        missing_label: str,
        error_label: str,
    ) -> Any:
        """Instantiate ``agent_class``; substitute a stub when absent or failing."""
        if agent_class is None:
            return _StubAgent(tool_name, f"{missing_label} agent not yet implemented.")
        try:
            return agent_class()
        except Exception as exc:
            AGENT_LOAD_LOG.append(f"{class_label} instantiation failed: {exc}")
            return _StubAgent(tool_name, f"{error_label} agent load error: {exc}")

    math_agent = _instantiate_or_stub(math_class, "/math", "MathAgent", "Math", "Math")
    _register_agent("/math", math_agent)

    code_agent = _instantiate_or_stub(code_class, "/code", "CodeAgent", "Code", "Code")
    _register_agent("/code", code_agent)

    general_agent = _instantiate_or_stub(
        general_class,
        "/general-search",
        "GeneralSearchAgent",
        "General-search",
        "General",
    )
    _register_agent("/general-search", general_agent)
else:
    AGENT_LOAD_LOG.append("AgentRequest/AgentResult undefined; skipping agent registry.")
# Markdown bullet list of load events, or a single default bullet when the log is empty.
if AGENT_LOAD_LOG:
    AGENT_STATUS_MARKDOWN = "\n".join(f"- {line}" for line in AGENT_LOAD_LOG)
else:
    AGENT_STATUS_MARKDOWN = "- Agent stubs loaded successfully."
# ``run_startup_benchmark`` is defined further down in this module, so calling it
# at this point raises NameError while the module is being imported. Start from
# a neutral placeholder with the same keys the UI reads ("message", "report").
STARTUP_BENCHMARK_RESULT: Dict[str, Any] = {
    "status": "pending",
    "message": "Benchmark not run.",
}
def load_sample_plan() -> Dict[str, Any]:
    """Return a router plan to display when no live router is available.

    The plan is taken from the ``completion`` field of the first record in
    ``BENCH_GOLD_PATH`` when that file is readable and the field decodes to a
    JSON object. Otherwise a built-in minimal example is returned.

    Returns:
        A plan dictionary; this function does not raise for a missing,
        unreadable or malformed benchmark file.
    """
    try:
        # Only the first line is needed, so avoid reading the whole file.
        with BENCH_GOLD_PATH.open(encoding="utf-8") as handle:
            first_line = handle.readline()
        record = json.loads(first_line)
        completion = record["completion"]
        if isinstance(completion, str):
            completion = json.loads(completion)
        if isinstance(completion, dict):
            return completion
    except (OSError, ValueError, KeyError, TypeError, IndexError):
        # Best effort: any unreadable or malformed gold file falls through
        # to the built-in example below.
        pass
    # Fallback minimal example.
    return {
        "route_plan": [
            "/general-search(query=\"site:arxiv.org meta-learning survey\", mode=web)",
            "/math(Outline a theoretical summary of Model-Agnostic Meta-Learning (MAML) and explain the inner/outer-loop updates.)",
            "/code(Implement a minimal MAML pseudo-code example to clarify the algorithm flow., using Python)",
        ],
        "route_rationale": (
            "Search surfaces authoritative meta-learning references; "
            "math distills the theory; code converts the derivation into an executable sketch."
        ),
        "expected_artifacts": [
            "Three bullet summary of seminal MAML papers.",
            "Equation block describing the meta-gradient.",
            "`maml_pseudocode.py` script with comments.",
        ],
        "thinking_outline": [
            "1. Gather citations describing MAML.",
            "2. Express the loss formulation and gradient steps.",
            "3. Provide annotated pseudo-code for the inner/outer loop.",
        ],
        "handoff_plan": "/general-search -> /math -> /code -> router QA",
        "todo_list": [
            "- [ ] /general-search: Collect recent survey or benchmark sources for MAML.",
            "- [ ] /math: Write the meta-objective and gradient derivation.",
            "- [ ] /code: Produce pseudo-code and comment on hyperparameters.",
            "- [ ] router QA: Ensure JSON schema compliance and cite sources.",
        ],
        "difficulty": "intermediate",
        "tags": ["meta-learning", "few-shot-learning"],
        "acceptance_criteria": [
            "- Includes at least two citations to reputable sources.",
            "- Meta-gradient expression matches the pseudo-code implementation.",
            "- JSON validates against the router schema.",
        ],
        "metrics": {
            "primary": ["Route accuracy >= 0.8 on benchmark."],
            "secondary": ["Report token count and inference latency."],
        },
    }
# Plan returned when no live router client is configured.
SAMPLE_PLAN: Dict[str, Any] = load_sample_plan()
# Captures the leading "/tool-name" token of a route step, ignoring leading whitespace.
TOOL_REGEX = re.compile(r"^\s*(/[a-zA-Z0-9_-]+)")
def extract_json_from_text(raw_text: str) -> Dict[str, Any]:
    """Parse the first JSON object embedded in ``raw_text``.

    Decoding starts at the first ``{`` and stops where that object ends, so
    text before or after the object is ignored, including trailing text that
    itself contains ``}``.

    Args:
        raw_text: Raw model output that should contain a JSON object.

    Returns:
        The decoded object.

    Raises:
        ValueError: If there is no ``{`` or the text from there on does not
            begin with a valid JSON object. The original error is chained.
    """
    try:
        start = raw_text.index("{")
        parsed, _end = json.JSONDecoder().raw_decode(raw_text, start)
    except Exception as exc:
        # Kept broad on purpose: callers rely on every failure (including
        # non-string input) surfacing as ValueError.
        raise ValueError(f"Router output is not valid JSON: {exc}") from exc
    return parsed
def call_router_model(user_query: str) -> Dict[str, Any]:
    """Ask the router backend for a plan, or return the sample plan.

    Without a configured client the sample plan is returned directly. When the
    call or the JSON extraction fails, a dict with an ``error`` message and the
    sample plan under ``sample_plan`` is returned instead of raising.
    """
    if client is None:
        return SAMPLE_PLAN

    prompt = f"{SYSTEM_PROMPT}\n\nUser query:\n{user_query.strip()}\n"
    generation_options = {
        "max_new_tokens": 900,
        "temperature": 0.2,
        "top_p": 0.9,
        "repetition_penalty": 1.05,
    }
    try:
        raw = client.text_generation(prompt, **generation_options)
        return extract_json_from_text(raw)
    except Exception as exc:  # pragma: no cover
        failure = f"Router call failed ({exc}). Falling back to sample plan."
        return {"error": failure, "sample_plan": SAMPLE_PLAN}
def generate_plan(user_query: str) -> Dict[str, Any]:
    """Return the router plan for ``user_query``.

    Raises:
        gr.Error: If the query is empty or only whitespace.
    """
    if user_query.strip():
        return call_router_model(user_query)
    raise gr.Error("Please provide a user query to route.")
def generate_plan_and_store(user_query: str) -> tuple[Dict[str, Any], str]:
    """Return the generated plan together with the query that produced it."""
    return generate_plan(user_query), user_query
def _resolve_plan_object(plan_input: Any) -> Optional[Dict[str, Any]]:
    """Normalise UI plan input into a plan dictionary.

    Args:
        plan_input: A JSON string or an already-decoded dictionary. A wrapper
            without ``route_plan`` that carries a dict under ``sample_plan``
            is unwrapped to that inner plan.

    Returns:
        The plan dictionary, or ``None`` when the input is not a dict, is
        invalid JSON, or is JSON that decodes to something other than an
        object (list, string, number, null).
    """
    if isinstance(plan_input, str):
        try:
            plan_obj = json.loads(plan_input)
        except json.JSONDecodeError:
            return None
    else:
        plan_obj = plan_input

    # Check the decoded value too: valid JSON is not necessarily an object,
    # and the membership/.get calls below only make sense on a dict.
    if not isinstance(plan_obj, dict):
        return None

    if "route_plan" not in plan_obj and isinstance(plan_obj.get("sample_plan"), dict):
        return plan_obj["sample_plan"]
    return plan_obj
def execute_plan(plan_input: Any, original_query: str) -> Dict[str, Any]:
    """Run every ``route_plan`` step through its registered agent.

    Args:
        plan_input: Plan as a JSON string or dictionary.
        original_query: User query passed to each agent in the request context.

    Returns:
        ``{"success": False, "error": ...}`` when execution cannot start, else
        ``{"success": True, "results": [...]}`` with one entry per step. A step
        that fails is reported in its entry and does not stop later steps.
    """
    if AgentRequest is None or AgentResult is None:
        return {"success": False, "error": "Agent interfaces unavailable; cannot execute plan."}

    plan_obj = _resolve_plan_object(plan_input)
    if not plan_obj:
        return {"success": False, "error": "Plan must be valid JSON with a route_plan field."}

    route_plan = plan_obj.get("route_plan")
    if not isinstance(route_plan, list):
        return {"success": False, "error": "Plan is missing a route_plan list."}

    results: List[Dict[str, Any]] = []

    def record(index: int, **fields: Any) -> None:
        # Keyword order is preserved, so entries keep a stable key order.
        results.append({"step_index": index, **fields})

    for step_index, step in enumerate(route_plan):
        if not isinstance(step, str):
            record(step_index, status="invalid_step", message="Route step must be a string.")
            continue

        match = TOOL_REGEX.match(step)
        tool_name = match.group(1) if match else "unknown"
        agent = AGENT_REGISTRY.get(tool_name) or AGENT_REGISTRY.get(tool_name.lstrip("/"))
        if agent is None:
            record(
                step_index,
                tool=tool_name,
                status="skipped",
                message="No agent registered for this tool.",
            )
            continue

        request = AgentRequest(
            user_query=step,
            context={"original_query": original_query},
            plan_metadata={"step_index": step_index, "raw_step": step},
        )
        try:
            agent_result = agent.invoke(request)
        except Exception as exc:
            if not fallback_manager.available:
                record(step_index, tool=tool_name, status="error", message=str(exc))
                continue
            # The agent failed but the fallback is configured: give it one try.
            try:
                agent_result = fallback_manager.generate(tool_name, request, error=str(exc))
            except Exception as fallback_exc:  # pragma: no cover
                record(
                    step_index,
                    tool=tool_name,
                    status="error",
                    message=f"{exc}; fallback failed: {fallback_exc}",
                )
                continue

        record(
            step_index,
            tool=tool_name,
            content=getattr(agent_result, "content", ""),
            citations=getattr(agent_result, "citations", []),
            artifacts=getattr(agent_result, "artifacts", []),
            metrics=getattr(agent_result, "metrics", {}),
        )

    return {"success": True, "results": results}
def run_startup_benchmark() -> Dict[str, Any]:
    """Run the benchmark once against a predictions file named in the environment.

    The predictions path is read from ``ROUTER_BENCHMARK_PREDICTIONS``.

    Returns:
        A dict with ``status`` (``unavailable``, ``skipped``, ``error``,
        ``pass`` or ``fail``) and ``message``. On ``pass``/``fail`` it also
        carries ``report`` and ``predictions_path``. Failures are reported in
        the dict rather than raised.
    """
    if run_schema_evaluation is None or load_thresholds is None or evaluate_thresholds is None:
        return {"status": "unavailable", "message": "Benchmark utilities not available in this environment."}

    prediction_path = os.environ.get("ROUTER_BENCHMARK_PREDICTIONS")
    if not prediction_path:
        return {"status": "skipped", "message": "Set ROUTER_BENCHMARK_PREDICTIONS to auto-run benchmarks."}

    pred_path = Path(prediction_path)
    if not pred_path.exists():
        return {"status": "error", "message": f"Predictions file not found: {pred_path}"}
    if not BENCH_GOLD_PATH.exists() or not THRESHOLDS_PATH.exists():
        return {"status": "error", "message": "Benchmark gold or thresholds file missing."}

    try:
        schema_report = run_schema_evaluation(
            str(BENCH_GOLD_PATH),
            str(pred_path),
            max_error_examples=5,
        )
        thresholds = load_thresholds(THRESHOLDS_PATH)
        threshold_results = evaluate_thresholds(schema_report["metrics"], thresholds)
    except Exception as exc:
        return {"status": "error", "message": f"Benchmark run failed: {exc}"}

    status = "pass" if threshold_results.get("overall_pass") else "fail"
    return {
        "status": status,
        "message": f"Benchmark {status.upper()} on startup.",
        "report": {
            "schema_report": schema_report,
            "threshold_results": threshold_results,
        },
        "predictions_path": str(pred_path),
    }


# Evaluate once at import time, now that the function exists, so the UI built
# later in this module can display the outcome.
STARTUP_BENCHMARK_RESULT = run_startup_benchmark()
def _count_items(value: Any) -> int:
    """Return ``len(value)``, or 0 when the value is falsy or has no length."""
    try:
        return len(value or [])
    except TypeError:
        return 0


def compute_structural_metrics(plan: Dict[str, Any]) -> Dict[str, Any]:
    """Compute consistency metrics for a router plan.

    Tool-based metrics are only produced when ``tool_sequence`` is available
    and ``route_plan`` is a list. The two count metrics are always present.
    Wrongly typed fields (a non-string ``handoff_plan``, unsized artifact or
    criteria values) are treated as empty rather than raising, because this
    runs on unvalidated model output.

    Args:
        plan: Decoded router plan.

    Returns:
        Mapping of metric name to value.
    """
    metrics: Dict[str, Any] = {}
    route_plan = plan.get("route_plan", [])

    if tool_sequence is not None and isinstance(route_plan, list):
        tools = tool_sequence(route_plan)
        raw_todo = plan.get("todo_list")
        todo_list = raw_todo if isinstance(raw_todo, list) else []
        if todo_tool_alignment is not None:
            metrics["todo_tool_alignment"] = todo_tool_alignment(todo_list, tools)
        if todo_covers_all_tools is not None:
            metrics["todo_covers_all_tools"] = todo_covers_all_tools(todo_list, tools)

        handoff = plan.get("handoff_plan", "")
        # Lower-case once, outside the loop; anything but a string counts as empty.
        handoff_text = handoff.lower() if isinstance(handoff, str) else ""
        metrics["handoff_mentions_all_tools"] = all(
            tool.lower() in handoff_text for tool in tools
        )

    metrics["expected_artifacts_count"] = _count_items(plan.get("expected_artifacts"))
    metrics["acceptance_criteria_count"] = _count_items(plan.get("acceptance_criteria"))
    return metrics
def validate_plan(plan_input: Any) -> Dict[str, Any]:
    """Check a router plan for required fields and basic field types.

    Args:
        plan_input: Plan as a JSON string or dictionary. A falsy non-string
            value is treated as an empty plan.

    Returns:
        For undecodable JSON or a plan that is not a JSON object:
        ``{"valid": False, "errors": [...]}``. Otherwise a dict with ``valid``,
        ``errors``, ``structural_metrics`` and ``tool_count``.
    """
    if isinstance(plan_input, str):
        try:
            plan = json.loads(plan_input)
        except json.JSONDecodeError as exc:
            return {"valid": False, "errors": [f"Invalid JSON: {exc}"]}
    else:
        plan = plan_input or {}

    # Valid JSON may still be a list, string, number or null; the field checks
    # below need a mapping, so report that instead of raising.
    if not isinstance(plan, dict):
        return {"valid": False, "errors": ["Plan must be a JSON object (dictionary)."]}

    required_keys = (
        "route_plan",
        "route_rationale",
        "expected_artifacts",
        "thinking_outline",
        "handoff_plan",
        "todo_list",
        "difficulty",
        "tags",
        "acceptance_criteria",
        "metrics",
    )
    errors = [f"Missing required field: {key}" for key in required_keys if key not in plan]

    route_plan = plan.get("route_plan")
    if not isinstance(route_plan, list) or not route_plan:
        errors.append("route_plan must be a non-empty list of tool invocations.")
    elif not all(isinstance(step, str) for step in route_plan):
        # Reported once, however many entries are wrong.
        errors.append("Each route_plan entry must be a string.")

    todo_list = plan.get("todo_list")
    if todo_list is not None and not isinstance(todo_list, list):
        errors.append("todo_list must be a list of strings.")

    metrics_block = plan.get("metrics")
    if metrics_block is not None and not isinstance(metrics_block, dict):
        errors.append("metrics must be a dictionary with primary/secondary lists.")

    structural = compute_structural_metrics(plan)
    return {
        "valid": not errors,
        "errors": errors,
        "structural_metrics": structural,
        "tool_count": len(route_plan) if isinstance(route_plan, list) else 0,
    }
def benchmark_predictions(pred_file: Any) -> Dict[str, Any]:
    """Score an uploaded predictions file against the gold benchmark and thresholds.

    Args:
        pred_file: An object with a ``name`` attribute holding a path, a path
            string, or a readable file-like object. File-like content is
            spooled to a temporary ``.jsonl`` file that is removed before
            returning.

    Returns:
        ``{"success": False, "error": ...}`` (plus import error details when
        the benchmark utilities are missing) on failure, otherwise a dict with
        ``success``, ``overall_pass``, ``schema_metrics``,
        ``threshold_results`` and ``error_samples``.
    """
    if run_schema_evaluation is None or load_thresholds is None or evaluate_thresholds is None:
        return {
            "success": False,
            "error": "Benchmark utilities are unavailable.",
            "schema_import_error": SCHEMA_IMPORT_ERROR,
            "threshold_import_error": THRESHOLD_IMPORT_ERROR,
        }
    if not BENCH_GOLD_PATH.exists():
        return {
            "success": False,
            "error": f"Benchmark gold file missing: {BENCH_GOLD_PATH}",
        }
    if not THRESHOLDS_PATH.exists():
        return {
            "success": False,
            "error": f"Thresholds file missing: {THRESHOLDS_PATH}",
        }
    if pred_file is None:
        return {"success": False, "error": "Upload a .jsonl predictions file first."}

    temp_path: Optional[Path] = None
    if hasattr(pred_file, "name"):
        pred_path = Path(pred_file.name)
    elif isinstance(pred_file, str):
        pred_path = Path(pred_file)
    else:
        # Save uploaded content to a temp file; the temp file is opened in
        # binary mode, so text payloads are encoded first.
        payload = pred_file.read()
        if isinstance(payload, str):
            payload = payload.encode("utf-8")
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jsonl") as tmp:
            tmp.write(payload)
        temp_path = pred_path = Path(tmp.name)

    try:
        try:
            schema_report = run_schema_evaluation(
                str(BENCH_GOLD_PATH),
                str(pred_path),
                max_error_examples=10,
            )
        except Exception as exc:
            return {"success": False, "error": f"Schema evaluation failed: {exc}"}

        try:
            thresholds = load_thresholds(THRESHOLDS_PATH)
            threshold_results = evaluate_thresholds(schema_report["metrics"], thresholds)
        except Exception as exc:
            return {"success": False, "error": f"Threshold comparison failed: {exc}"}

        return {
            "success": True,
            "overall_pass": threshold_results.get("overall_pass"),
            "schema_metrics": schema_report["metrics"],
            "threshold_results": threshold_results,
            "error_samples": schema_report.get("error_samples", []),
        }
    finally:
        # Only delete what this function created; cleanup is best effort.
        if temp_path is not None:
            try:
                temp_path.unlink()
            except OSError:
                pass
def describe_router_backend() -> str:
    """Return a one-line status string for the router inference backend."""
    if client is not None:
        return f"Using Hugging Face Inference endpoint: `{HF_ROUTER_REPO}`"
    return f"Router backend not initialised. {ROUTER_LOAD_ERROR}"
# Static UI copy, kept apart from the layout code below.
_INTRO_MARKDOWN = (
    "## CourseGPT Router Control Room\n"
    "Milestone 6 deployment scaffold for the router agent. Populate the router model "
    "environment variables to enable live inference, or rely on the bundled sample plan."
)
_BENCHMARK_HELP_MARKDOWN = (
    "Upload a JSONL file of router predictions (one JSON object per line). "
    "The file must align with the `router_benchmark_hard.jsonl` gold split."
)
_DOCS_MARKDOWN = (
    "- Populate `/math`, `/code`, `/general-search` agent hooks for live orchestration.\n"
    "- Add citations and latency logging once the production router is connected.\n"
    "- Link to Milestone 5 benchmark reports and final project documentation."
)

# Components are created in display order; keep the order when editing.
with gr.Blocks(title="CourseGPT Router Control Room") as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    gr.Markdown(f"**Backend status:** {describe_router_backend()}")

    with gr.Tab("Router Planner"):
        # Remembers the query behind the displayed plan for later execution.
        user_query_state = gr.State("")
        user_query = gr.Textbox(
            label="User query",
            lines=8,
            placeholder="Describe the task that needs routing...",
        )

        generate_btn = gr.Button("Generate plan", variant="primary")
        plan_output = gr.JSON(label="Router plan")
        generate_btn.click(
            fn=generate_plan_and_store,
            inputs=user_query,
            outputs=[plan_output, user_query_state],
        )

        validate_btn = gr.Button("Run structural checks")
        validation_output = gr.JSON(label="Validation summary")
        validate_btn.click(
            fn=validate_plan,
            inputs=plan_output,
            outputs=validation_output,
        )

        execute_btn = gr.Button("Simulate agent execution")
        execution_output = gr.JSON(label="Agent execution log")
        execute_btn.click(
            fn=execute_plan,
            inputs=[plan_output, user_query_state],
            outputs=execution_output,
        )

    with gr.Tab("Benchmark"):
        gr.Markdown(_BENCHMARK_HELP_MARKDOWN)
        startup_status = STARTUP_BENCHMARK_RESULT.get("message", "Benchmark not run.")
        gr.Markdown(f"**Startup benchmark status:** {startup_status}")
        # The detailed report is only shown when a startup run produced one.
        if STARTUP_BENCHMARK_RESULT.get("report"):
            gr.JSON(
                value=STARTUP_BENCHMARK_RESULT["report"],
                label="Startup benchmark report",
            )

        predictions_file = gr.File(label="Predictions (.jsonl)", file_types=[".jsonl"])
        benchmark_btn = gr.Button("Evaluate against thresholds", variant="primary")
        benchmark_output = gr.JSON(label="Benchmark report")
        benchmark_btn.click(
            fn=benchmark_predictions,
            inputs=predictions_file,
            outputs=benchmark_output,
        )

    with gr.Tab("Docs & TODO"):
        gr.Markdown(_DOCS_MARKDOWN)
        gr.Markdown(f"**Agent load summary:**\n{AGENT_STATUS_MARKDOWN}")

demo.queue()

if __name__ == "__main__":  # pragma: no cover
    demo.launch()