Spaces:

crazycrazypete
/

Masters-four-Tab-OpenAI

Running

App Files Files Community

Pete Dunn committed on Mar 29

Commit

ddae266

1 Parent(s): 640f404

Harden router canary eval preflight and survey runs

Browse files

Files changed (12) hide show

.github/workflows/deploy-hf-gated.yml +2 -0
backend/app/knowledgebase/core.py +247 -18
backend/app/main.py +25 -2
backend/app/test_rapid_router_catalog_bootstrap.py +24 -0
backend/app/test_score_router_canary_ab_responses.py +82 -0
backend/app/test_unified_kb_core.py +95 -0
backend/app/test_unified_kb_router_workbook.py +91 -0
backend/app/test_validate_hosted_runtime.py +124 -1
backend/scripts/run_router_canary_ab_eval_shard.py +657 -0
backend/scripts/score_router_canary_ab_responses.py +363 -0
backend/scripts/validate_hosted_runtime.py +42 -0
frontend/scripts/run-hosted-smoke.sh +9 -0

.github/workflows/deploy-hf-gated.yml CHANGED Viewed

@@ -379,6 +379,7 @@ jobs:
             --expected-git-sha "${EXPECTED_SHA}" \
             --expect-auth-required true \
             --expect-auth-enabled true \
             --out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
   deploy-production:
@@ -514,4 +515,5 @@ jobs:
             --expected-git-sha "${EXPECTED_SHA}" \
             --expect-auth-required true \
             --expect-auth-enabled true \
             --out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"

             --expected-git-sha "${EXPECTED_SHA}" \
             --expect-auth-required true \
             --expect-auth-enabled true \
+            --require-router-workbook-loaded true \
             --out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
   deploy-production:
             --expected-git-sha "${EXPECTED_SHA}" \
             --expect-auth-required true \
             --expect-auth-enabled true \
+            --require-router-workbook-loaded true \
             --out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"

backend/app/knowledgebase/core.py CHANGED Viewed

@@ -1695,6 +1695,52 @@ def _looks_like_router_lifecycle(message: str) -> bool:
     return False
 def _looks_like_pots(message: str) -> bool:
     low = _normalize_router_query_text(message)
     if _contains_any(low, _POTS_HINTS):
@@ -8811,8 +8857,20 @@ class UnifiedKnowledgebaseCore:
                 continue
             seen_requested_compacts.add(compact)
             requested_compare_labels.append(label)
         asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
         if asks_install_caveats:
             compare_field_labels = {
                 "wan_lan": "WAN/LAN ports",
                 "antennas_rf": "RF connectors",
@@ -8842,13 +8900,108 @@ class UnifiedKnowledgebaseCore:
             elif "install_caveats" not in requested_compare_fields:
                 requested_compare_fields.append("install_caveats")
             def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
                 raw = _norm(row.get(field, ""))
                 if (not raw) and field == "install_caveats":
                     raw = _norm(row.get("special notes", ""))
-                if not raw:
-                    return "Not clearly documented"
-                return _truncate(_fix_common_mojibake(raw), 170)
             def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
                 doc_rows: List[Dict[str, str]] = []
@@ -9140,14 +9293,6 @@ class UnifiedKnowledgebaseCore:
             if catalog_table is not None:
                 return catalog_table
             return None
-        wants_doc_matrix = (
-            ("documented" in low and "not documented" in low)
-            or ("include what is documented" in low)
-            or ("what is documented vs not documented" in low)
-        )
-        wants_docs_only = wants_doc_matrix or any(
-            h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
-        )
         deterministic_doc_matrix_supported = bool(
             wants_doc_matrix
             and len(dedup_models) >= 2
@@ -9185,6 +9330,31 @@ class UnifiedKnowledgebaseCore:
             if not value:
                 return "Not clearly documented"
             low_value = value.lower()
             if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
                 return "Not clearly documented"
             if field_name == "wan_lan":
@@ -9206,6 +9376,8 @@ class UnifiedKnowledgebaseCore:
                     return "Not clearly documented"
                 return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
             if field_name == "antennas_rf":
                 if "rf:" in value.lower():
                     value = value.split("RF:", 1)[-1]
                 connector_match = re.search(
@@ -9237,6 +9409,8 @@ class UnifiedKnowledgebaseCore:
                     return "Not clearly documented"
                 return _truncate(value, 120)
             if field_name == "modem":
                 modem_match = re.search(
                     r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
                     value,
@@ -12488,6 +12662,51 @@ class UnifiedKnowledgebaseCore:
                 return "Listed, but family not explicit"
             return "Needs connector validation"
         def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
             use_case = _norm(row.get("primary_use_case", ""))
             rugged = _norm(row.get("ruggedization", ""))
@@ -12663,7 +12882,7 @@ class UnifiedKnowledgebaseCore:
                 sources: List[Dict[str, Any]] = []
                 for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
                     wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
-                    rf = _norm(row.get("antennas_rf", "")) or "Not listed"
                     lines.append(
                         f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
                         f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
@@ -12810,7 +13029,7 @@ class UnifiedKnowledgebaseCore:
             modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
             rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
             battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
-            rf = _norm(row.get("antennas_rf", "")) or "Not listed (abstained)"
             lines.append(
                 f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
                 f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
@@ -29329,9 +29548,19 @@ class UnifiedKnowledgebaseCore:
             ]
             ranking_rows: List[Tuple[int, int, str]] = []
             def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
                 qty = int(item.get("qty") or 0)
-                router_name = str(item.get("model_display") or item.get("product_text") or "Unknown")
                 if not matched:
                     return 150 + min(qty, 25), (
                         f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
@@ -29373,7 +29602,7 @@ class UnifiedKnowledgebaseCore:
                     reasons.append("authoritative lifecycle dates are incomplete")
                 reason_text = ", and ".join(reasons[:3])
-                return score, f"`{_display_name(match) or router_name}` should be prioritized because {reason_text}."
             asks_ranked_output = any(
                 token in query.normalized_message
@@ -29393,14 +29622,14 @@ class UnifiedKnowledgebaseCore:
                     status = _status_label(match, lifecycle)
                     eos = lifecycle.get("end_of_sale_date") or "Not listed"
                     eol = lifecycle.get("last_support_date") or "Not listed"
-                    router_name = _display_name(match) or str(item.get("model_display") or item.get("product_text") or "Unknown")
                 else:
                     same_brand = "Needs exact workbook match"
                     backup = "Needs exact workbook match"
                     status = "Needs exact workbook match"
                     eos = "Not listed"
                     eol = "Not listed"
-                    router_name = str(item.get("model_display") or item.get("product_text") or "Unknown")
                 rows.append(
                     "| "
                     + " | ".join(
@@ -31813,7 +32042,7 @@ class UnifiedKnowledgebaseCore:
             blocked["meta"] = blocked_meta
             return blocked
         lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
-        if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text):
             blocked = self._policy_block_response("exact_lifecycle", st)
             blocked_meta = _as_dict(blocked.get("meta"))
             blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}

     return False
+def _is_supported_router_mixed_lifecycle_request(message: str) -> bool:
     # Decide whether a message is a router details/compare request that may
     # be answered. Returns True only when the parsed query intent is
     # "details" or "compare", none of the explicit date phrases below appear
     # in the normalized text, and at least one spec/compare phrase does.
+    normalized = _normalize_router_query_text(message)
+    query = parse_router_intelligence_query(message)
+    if query is None or query.intent not in {"details", "compare"}:
+        return False
     # Any explicit request for a lifecycle date disqualifies the message.
+    exact_date_tokens = (
+        "exact lifecycle date",
+        "lifecycle date",
+        "end of sale date",
+        "end of life date",
+        "eos date",
+        "eol date",
+        "what exact date",
+    )
+    if any(token in normalized for token in exact_date_tokens):
+        return False
     # Phrases that signal a spec, status, or comparison question.
     # NOTE(review): these are plain substring tests, so short entries such as
     # "wan" or "lan" would also match inside longer words (e.g. "want",
     # "plan") unless _normalize_router_query_text prevents that - confirm.
+    safe_field_tokens = (
+        "primary use case",
+        "use case",
+        "wan/lan",
+        "wan",
+        "lan",
+        "ethernet",
+        "ports",
+        "wi-fi",
+        "wifi",
+        "modem",
+        "rf connector",
+        "rf connectors",
+        "connectors",
+        "install caveat",
+        "install caveats",
+        "current recommendation",
+        "still current recommendation",
+        "status",
+        "documented vs",
+        "not documented",
+        "compare",
+        "difference",
+        "different",
+        "versus",
+        " vs ",
+    )
+    return any(token in normalized for token in safe_field_tokens)
 def _looks_like_pots(message: str) -> bool:
     low = _normalize_router_query_text(message)
     if _contains_any(low, _POTS_HINTS):
                 continue
             seen_requested_compacts.add(compact)
             requested_compare_labels.append(label)
+        wants_doc_matrix = (
+            ("documented" in low and "not documented" in low)
+            or ("include what is documented" in low)
+            or ("what is documented vs not documented" in low)
+        )
+        wants_docs_only = wants_doc_matrix or any(
+            h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
+        )
         asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
         if asks_install_caveats:
+            # This helper only runs in the router_docs lane, so install-caveat
+            # compare prompts should still stay source-bounded even when the
+            # user omits an explicit "docs only" phrase.
+            wants_docs_only = True
             compare_field_labels = {
                 "wan_lan": "WAN/LAN ports",
                 "antennas_rf": "RF connectors",
             elif "install_caveats" not in requested_compare_fields:
                 requested_compare_fields.append("install_caveats")
+            variant_ambiguity_markers = (
+                "vary by sku",
+                "varies by sku",
+                "depends on package",
+                "depends on package or accessory",
+                "exact modem depends",
+                "package or accessory",
+                "supported modem option",
+                "supported modem options",
+                "supported modem sku",
+                "supported modem skus",
+                "modem-equipped variant",
+                "modem-equipped variants",
+                "no-modem",
+            )
+            def _has_variant_ambiguity(value: str) -> bool:
                 # True when the lower-cased value contains any phrase from
                 # variant_ambiguity_markers (substring match).
+                low_value = value.lower()
+                return any(marker in low_value for marker in variant_ambiguity_markers)
+            def _sanitize_catalog_install_compare_value(field: str, raw_value: Any) -> str:
                 # Reduce one raw catalog cell to a short table value for the
                 # given compare field. Returns "Not clearly documented" when the
                 # cell is empty or the field-specific pattern is not found;
                 # fields without a dedicated branch fall through to the final
                 # _truncate(value, 170).
+                value = _fix_common_mojibake(_norm(raw_value))
+                if not value:
+                    return "Not clearly documented"
+                low_value = value.lower()
+                if field == "wan_lan":
                     # Prefer an explicit port-count phrase when one is present.
+                    port_match = re.search(
+                        r"\bfive ethernet ports?\b|\bdual ethernet ports?\b|\b\d+\s*(?:x|\*)\s*(?:10/100/1000|10/100|100mbit/s|100mbps|1gbe|2\.5gbe|ge)\s*(?:rj45\s*)?(?:ethernet|network)\s*ports?\b(?:[^.;]{0,40}\b(?:wan|lan|vlan)\b[^.;]{0,40})?",
+                        value,
+                        flags=re.IGNORECASE,
+                    )
+                    if port_match:
+                        value = port_match.group(0)
+                        low_value = value.lower()
                     # Require at least one port-related signal before reporting.
+                    has_port_signal = bool(
+                        re.search(r"\b(?:wan|lan|ethernet|rj45|sfp\+?)\b", low_value)
+                        or re.search(r"\b\d+\s*x\b", low_value)
+                        or re.search(r"\b\d+(?:\.\d+)?\s*(?:gbe?|mbps|gbps)\b", low_value)
+                        or re.search(r"\b(single|dual|triple|quad|five)\b", low_value)
+                    )
+                    if not has_port_signal:
+                        return "Not clearly documented"
+                    return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
+                if field == "antennas_rf":
                     # Variant-dependent text gets a fixed "needs exact SKU" answer
                     # instead of a connector extracted from it.
+                    if _has_variant_ambiguity(value):
+                        return "Needs exact SKU/package; connector path varies across documented family variants."
+                    connector_match = re.search(
+                        r"(?:\d+\s*x\s*)?(?:rp-?sma|sma)\b[^.;]{0,80}|antenna connectors?[^.;]{0,100}|(?:gps|gnss)\b[^.;]{0,80}",
+                        value,
+                        flags=re.IGNORECASE,
+                    )
+                    if not connector_match:
+                        return "Not clearly documented"
+                    value = connector_match.group(0)
                     # Strip leading placement labels, hedging parentheticals, and
                     # trailing adapter/verification/SIM/Ethernet notes.
+                    value = re.sub(r"^(?:both|external|internal)\s*\([^)]*\)\s*;?\s*", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"\([^)]*(?:depends on|if present|verify [^)]+|see [^)]+ docs)[^)]*\)", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"[.;]\s*Adapter pigtails?:[^.;]*", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"[.;]\s*connectors likely[^.;]*", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"[.;]\s*verify connector type[^.;]*", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"[.;]\s*(SIM|Ethernet):[^.;]*", "", value, flags=re.IGNORECASE)
+                    value = re.sub(r"\s+", " ", value).strip(" ;.")
+                    low_value = value.lower()
                     # After cleanup the text must still name a connector type.
+                    if (not value) or (not any(token in low_value for token in ("sma", "rp-sma", "connector", "gnss", "gps"))):
+                        return "Not clearly documented"
+                    return _truncate(value, 140)
+                if field == "modem":
+                    if _has_variant_ambiguity(value):
+                        return "Needs exact SKU/package; modem path varies across documented family variants."
                     # Keep only the 4G/LTE/5G phrase and up to 50 following characters.
+                    modem_match = re.search(
+                        r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
+                        value,
+                        flags=re.IGNORECASE,
+                    )
+                    if not modem_match:
+                        return "Not clearly documented"
+                    value = modem_match.group(0)
+                    low_value = value.lower()
+                    if low_value in {"modem", "modems", "cellular", "cellular modem"}:
+                        return "Not clearly documented"
                     # Reject matches whose trailing text mentions non-modem topics.
+                    if any(token in low_value for token in ("ethernet", "wan", "lan", "cloud management", "ports", "wi-fi", "vpn", "connectivity", "secure")):
+                        return "Not clearly documented"
+                    return _truncate(value, 120)
+                if field == "wifi":
+                    if any(token in low_value for token in ("none", "no wi-fi", "no wifi", "without wi-fi", "without wifi")):
+                        return "None"
                     # Otherwise require an 802.11 mention or "Wi-Fi 4" through "Wi-Fi 7".
+                    if ("802.11" not in low_value) and (not re.search(r"\bwi-?fi\s*[4567]\b", low_value)):
+                        return "Not clearly documented"
+                    return _truncate(value, 120)
+                if field == "battery":
+                    if low_value == "none":
+                        return "None"
+                    return _truncate(value, 120)
+                if field == "install_caveats":
+                    if _has_variant_ambiguity(value) or any(token in low_value for token in ("verify exact", "exact sku", "exact modem", "exact package")):
+                        return "Verify exact SKU/package before finalizing modem, RF, or accessory assumptions."
                 # Unambiguous install caveats and any other field end up here.
+                return _truncate(value, 170)
             def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
                 # Read one compare field from a catalog row and return it after
                 # _sanitize_catalog_install_compare_value has processed it.
                 raw = _norm(row.get(field, ""))
                 # An empty install_caveats cell falls back to "special notes".
                 if (not raw) and field == "install_caveats":
                     raw = _norm(row.get("special notes", ""))
+                return _sanitize_catalog_install_compare_value(field, raw)
             def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
                 doc_rows: List[Dict[str, str]] = []
             if catalog_table is not None:
                 return catalog_table
             return None
         deterministic_doc_matrix_supported = bool(
             wants_doc_matrix
             and len(dedup_models) >= 2
             if not value:
                 return "Not clearly documented"
             low_value = value.lower()
+            variant_ambiguity_markers = (
+                "variant",
+                "variants",
+                "exact sku",
+                "exact package",
+                "exact modem",
+                "modem option",
+                "modem options",
+                "sku/package",
+                "sku package",
+                "depends on sku",
+                "depends on package",
+                "see model-specific docs",
+                "see exact sku docs",
+                "supported modem sku",
+                "supported modem skus",
+                "modem-equipped variant",
+                "modem-equipped variants",
+                "no-modem",
+            )
+            def _has_variant_ambiguity_local(text: str) -> bool:
                 # True when the lower-cased text contains any phrase from
                 # variant_ambiguity_markers (substring match).
+                local_low = text.lower()
+                return any(marker in local_low for marker in variant_ambiguity_markers)
             if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
                 return "Not clearly documented"
             if field_name == "wan_lan":
                     return "Not clearly documented"
                 return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
             if field_name == "antennas_rf":
+                if _has_variant_ambiguity_local(value):
+                    return "Needs exact SKU/package; connector path varies across documented family variants."
                 if "rf:" in value.lower():
                     value = value.split("RF:", 1)[-1]
                 connector_match = re.search(
                     return "Not clearly documented"
                 return _truncate(value, 120)
             if field_name == "modem":
+                if _has_variant_ambiguity_local(value):
+                    return "Needs exact SKU/package; modem path varies across documented family variants."
                 modem_match = re.search(
                     r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
                     value,
                 return "Listed, but family not explicit"
             return "Needs connector validation"
+        def _vehicle_compare_connector_summary(row: Dict[str, Any]) -> str:
             # Build a short connector summary from the row's "antennas_rf" cell.
             # Distinct connector clauses are collected in order and joined with
             # "; "; returns "Not clearly documented" when the cell is empty or
             # no clause could be collected.
+            value = _fix_common_mojibake(_norm(row.get("antennas_rf", "")))
+            if not value:
+                return "Not clearly documented"
+            low_value = value.lower()
+            clauses: List[str] = []
+            def _remember(text: str) -> None:
                 # Append a whitespace-collapsed clause unless it is empty or
                 # already present (compared case-insensitively).
+                cleaned = re.sub(r"\s+", " ", _norm(text)).strip(" ;,.")
+                if not cleaned:
+                    return
+                if cleaned.lower() in {item.lower() for item in clauses}:
+                    return
+                clauses.append(cleaned)
             # Pass 1: counted connectors such as "2x SMA cellular".
+            for match in re.finditer(
+                r"\b\d+\s*x\s*(?:sma|rp-?sma)\s*(?:cellular|wi-?fi|gnss|gps)?(?:\s+connectors?)?\b",
+                value,
+                flags=re.IGNORECASE,
+            ):
+                _remember(match.group(0))
             # Pass 2: a fixed set of uncounted connector phrasings.
+            for match in re.finditer(
+                r"\b(?:external\s+)?(?:cellular\s+sma connectors?|reverse-?sma wi-?fi connectors?|sma rf connectors?|wi-?fi variant uses rp-?sma)\b",
+                value,
+                flags=re.IGNORECASE,
+            ):
+                _remember(match.group(0))
             # Pass 3: connector mentions inside parentheses, split on ";" or ","
             # and skipping parts that mention typical/adapter/pigtail.
+            for group in re.findall(r"\(([^)]*(?:sma|rp-?sma|gps|gnss)[^)]*)\)", value, flags=re.IGNORECASE):
+                for part in re.split(r"[;,]", group):
+                    if any(token in part.lower() for token in ("typical", "adapter", "pigtail")):
+                        continue
+                    if re.search(r"\b(?:sma|rp-?sma|gps|gnss)\b", part, flags=re.IGNORECASE):
+                        _remember(part)
             # Fallback: nothing matched but connector words are present, so keep
             # the text before any "Adapter pigtail(s):" note, minus a few fixed
             # phrases.
+            if not clauses and any(token in low_value for token in ("sma", "rp-sma", "gps", "gnss", "connector")):
+                truncated = re.split(r"[.;]\s*Adapter pigtails?:", value, maxsplit=1, flags=re.IGNORECASE)[0]
+                truncated = re.sub(r"\bCellular:\s*4x4 MIMO on SMA\b", "", truncated, flags=re.IGNORECASE)
+                truncated = re.sub(r"\bWi-?Fi(?:\s*\(if present\))?\s+on\s+RP-SMA\b", "", truncated, flags=re.IGNORECASE)
+                truncated = re.sub(r"\bGNSS on SMA\b", "", truncated, flags=re.IGNORECASE)
+                truncated = re.sub(r"\s+", " ", truncated).strip(" ;,.")
+                if truncated:
+                    _remember(truncated)
+            if not clauses:
+                return "Not clearly documented"
+            return _truncate("; ".join(clauses), 140)
         def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
             use_case = _norm(row.get("primary_use_case", ""))
             rugged = _norm(row.get("ruggedization", ""))
                 sources: List[Dict[str, Any]] = []
                 for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
                     wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
+                    rf = _vehicle_compare_connector_summary(row)
                     lines.append(
                         f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
                         f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
             modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
             rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
             battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
+            rf = _vehicle_compare_connector_summary(row)
             lines.append(
                 f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
                 f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
             ]
             ranking_rows: List[Tuple[int, int, str]] = []
+            def _fleet_router_name(item: Dict[str, Any], matched: Dict[str, Any]) -> str:
                 # Pick the router label to show for a fleet item. The requested
                 # label comes from the item's "model_display" or "product_text";
                 # the product key comes from matched["match"]["product_key"].
+                requested_label = _norm(item.get("model_display") or item.get("product_text") or "")
+                match = _as_dict(matched.get("match"))
+                product_key = _norm(match.get("product_key") or "")
+                if requested_label and product_key:
                     # Keep the requested spelling only when its compact form equals
                     # the product key's; otherwise show the product key.
+                    if _compact_model(requested_label) == _compact_model(product_key):
+                        return requested_label
+                    return product_key
                 # At most one of the two is set here; fall back in this order.
+                return product_key or requested_label or _display_name(match) or "Unknown"
             def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
                 qty = int(item.get("qty") or 0)
+                router_name = _fleet_router_name(item, matched)
                 if not matched:
                     return 150 + min(qty, 25), (
                         f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
                     reasons.append("authoritative lifecycle dates are incomplete")
                 reason_text = ", and ".join(reasons[:3])
+                return score, f"`{router_name}` should be prioritized because {reason_text}."
             asks_ranked_output = any(
                 token in query.normalized_message
                     status = _status_label(match, lifecycle)
                     eos = lifecycle.get("end_of_sale_date") or "Not listed"
                     eol = lifecycle.get("last_support_date") or "Not listed"
+                    router_name = _fleet_router_name(item, matched)
                 else:
                     same_brand = "Needs exact workbook match"
                     backup = "Needs exact workbook match"
                     status = "Needs exact workbook match"
                     eos = "Not listed"
                     eol = "Not listed"
+                    router_name = _fleet_router_name(item, matched)
                 rows.append(
                     "| "
                     + " | ".join(
             blocked["meta"] = blocked_meta
             return blocked
         lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
+        if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text) and (not _is_supported_router_mixed_lifecycle_request(msg)):
             blocked = self._policy_block_response("exact_lifecycle", st)
             blocked_meta = _as_dict(blocked.get("meta"))
             blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}

backend/app/main.py CHANGED Viewed

@@ -1706,16 +1706,19 @@ _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS: Tuple[str, ...] = (
 def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
-    search_roots: List[Path] = [
         Path.cwd(),
         Path.cwd() / "backend",
         Path.home() / "Downloads",
         Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
         Path("/data"),
         Path("/tmp"),
     ]
     seen: set[str] = set()
-    for root in search_roots:
         root_key = str(root)
         if root_key in seen or not root.exists() or not root.is_dir():
             continue
@@ -1724,6 +1727,26 @@ def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
             for candidate in sorted(root.glob(pattern)):
                 if candidate.is_file():
                     return candidate
     return None

 def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
+    direct_search_roots: List[Path] = [
         Path.cwd(),
         Path.cwd() / "backend",
+        Path.cwd() / "backend" / "data",
         Path.home() / "Downloads",
         Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
         Path("/data"),
+        Path("/data/rapid_router"),
         Path("/tmp"),
+        _resolve_rapid_router_storage_dir(),
     ]
     seen: set[str] = set()
+    for root in direct_search_roots:
         root_key = str(root)
         if root_key in seen or not root.exists() or not root.is_dir():
             continue
             for candidate in sorted(root.glob(pattern)):
                 if candidate.is_file():
                     return candidate
+    recursive_search_roots: List[Path] = [
+        Path.cwd() / "backend" / "data",
+        Path("/data"),
+        Path("/data/rapid_router"),
+        _resolve_rapid_router_storage_dir(),
+        Path("/tmp"),
+    ]
+    for root in recursive_search_roots:
+        root_key = f"recursive:{root}"
+        if root_key in seen or not root.exists() or not root.is_dir():
+            continue
+        seen.add(root_key)
+        for pattern in _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS:
+            for candidate in sorted(root.rglob(pattern)):
+                try:
+                    depth = len(candidate.relative_to(root).parts)
+                except Exception:
+                    depth = 999
+                if candidate.is_file() and depth <= 4:
+                    return candidate
     return None

backend/app/test_rapid_router_catalog_bootstrap.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from __future__ import annotations
+from pathlib import Path
+import app.main as main
+def test_resolve_rapid_router_catalog_workbook_path_finds_nested_backend_data_workbook(
+    tmp_path: Path, monkeypatch
+) -> None:
     # Create a placeholder workbook under backend/data/rapid_router/imports
     # inside tmp_path.
+    nested = tmp_path / "backend" / "data" / "rapid_router" / "imports"
+    nested.mkdir(parents=True)
+    workbook = nested / "device_master_source_of_truth_v26_site_survey_integrated_export.xlsx"
+    workbook.write_bytes(b"placeholder")
+    fake_home = tmp_path / "fake_home"
+    (fake_home / "Downloads").mkdir(parents=True)
     # Redirect the working directory, the storage dir, and Path.home() to
     # locations inside tmp_path before resolving.
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.setattr(main, "_resolve_rapid_router_storage_dir", lambda: tmp_path / "storage")
+    monkeypatch.setattr(main.Path, "home", classmethod(lambda cls: fake_home))
+    resolved = main._resolve_rapid_router_catalog_workbook_path()
+    assert resolved == workbook

backend/app/test_score_router_canary_ab_responses.py ADDED Viewed

	@@ -0,0 +1,82 @@

+from __future__ import annotations
+import importlib.util
+from pathlib import Path
+def _load_score_module():
     # Load scripts/score_router_canary_ab_responses.py by file path (resolved
     # relative to this test file) and return the executed module object.
+    script_path = Path(__file__).resolve().parents[1] / "scripts" / "score_router_canary_ab_responses.py"
+    spec = importlib.util.spec_from_file_location("score_router_canary_ab_responses", script_path)
+    assert spec is not None and spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+def test_normalize_grade_payload_scales_fractional_scores_to_percent() -> None:
     # Scores given as fractions between 0 and 1 are expected back as
     # whole-number percentages, with an overall of 74 graded "C".
+    module = _load_score_module()
+    out = module._normalize_grade_payload(
+        {
+            "fact_score": 0.64,
+            "instruction_score": 0.81,
+            "coverage_score": 0.52,
+            "readability_score": 0.9,
+            "safety_score": 1.0,
+            "overall_score": 0.74,
+            "issues": ["ambiguous_specs"],
+            "rationale": "fractional payload",
+        }
+    )
+    assert out["fact_score"] == 64
+    assert out["instruction_score"] == 81
+    assert out["coverage_score"] == 52
+    assert out["readability_score"] == 90
+    assert out["safety_score"] == 100
+    assert out["overall_score"] == 74
+    assert out["grade"] == "C"
+def test_normalize_grade_payload_scales_ten_point_scores_to_percent() -> None:
     # Scores given on a 0-10 scale are expected back multiplied by ten. The
     # payload has no "overall_score"; the test expects 81 and grade "B".
+    module = _load_score_module()
+    out = module._normalize_grade_payload(
+        {
+            "fact_score": 7,
+            "instruction_score": 8.5,
+            "coverage_score": 6,
+            "readability_score": 9,
+            "safety_score": 10,
+        }
+    )
+    assert out["fact_score"] == 70
+    assert out["instruction_score"] == 85
+    assert out["coverage_score"] == 60
+    assert out["readability_score"] == 90
+    assert out["safety_score"] == 100
+    assert out["overall_score"] == 81
+    assert out["grade"] == "B"
+def test_normalize_grade_payload_preserves_percent_scale_and_percent_strings() -> None:
     # Values already on a 0-100 scale, including numeric strings with or
     # without a trailing "%", are expected back unchanged as integers.
+    module = _load_score_module()
+    out = module._normalize_grade_payload(
+        {
+            "fact_score": "64%",
+            "instruction_score": "81",
+            "coverage_score": 52,
+            "readability_score": 90,
+            "safety_score": "100%",
+            "overall_score": 74,
+        }
+    )
+    assert out["fact_score"] == 64
+    assert out["instruction_score"] == 81
+    assert out["coverage_score"] == 52
+    assert out["readability_score"] == 90
+    assert out["safety_score"] == 100
+    assert out["overall_score"] == 74

backend/app/test_unified_kb_core.py CHANGED Viewed

@@ -4113,6 +4113,24 @@ def test_unified_kb_router_install_caveat_compare_defers_to_router_docs_delegate
     assert "Router docs answer." in str(out.get("assistant") or "")
 def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
     core = build_core()
     out = core.handle_message(
@@ -4239,6 +4257,21 @@ def test_unified_kb_docs_only_three_model_compare_defers_to_router_docs() -> Non
     assert "Lifecycle note:" not in assistant
 def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
     core = build_core_with(router_core=RepoCsvRouterCore())
     out = core.handle_message(
@@ -4405,6 +4438,54 @@ def test_unified_kb_router_decision_table_compare_defers_to_router_docs() -> Non
     assert "| MAXBR1PRO |" not in assistant
 def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
     dec_path = tmp_path / "feb2026routers.csv"
     dec_path.write_text(
@@ -4454,6 +4535,20 @@ def test_unified_kb_br1_connector_compare_uses_deterministic_compare_lane() -> N
     assert "Lifecycle note:" not in assistant
 def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
     core = build_core()
     out = core.handle_message(

     assert "Router docs answer." in str(out.get("assistant") or "")
+def test_unified_kb_docs_only_install_compare_keeps_variant_family_rows_conservative() -> None:
+    """A docs-only AER2200/AER1600 install-caveat table must not commit to variant-specific details.
+
+    Expects the ``router_multi_model_doc_caveat_table_fast`` retrieval mode, the
+    "Needs exact SKU/package" placeholders for the connector and modem paths,
+    and no specific modem-bundle or Wi-Fi connector wording in the answer.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore())
+    out = core.handle_message(
+        "For AER2200 and AER1600, summarize WAN/LAN, RF connectors, modem variants, and install caveats in one table from docs only.",
+        {},
+        mode="auto",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "Needs exact SKU/package; connector path varies across documented family variants." in assistant
+    assert "Needs exact SKU/package; modem path varies across documented family variants." in assistant
+    assert "Cat 4 / Cat 6 / LTE Advanced Pro bundles" not in assistant
+    assert "external reverse-SMA Wi-Fi connectors" not in assistant
 def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
     core = build_core()
     out = core.handle_message(
     assert "Lifecycle note:" not in assistant
+def test_unified_kb_docs_only_mg_eval_prompt_stays_documented_and_conservative() -> None:
+    """The MG51/MG52/MG52E documented-vs-not-documented prompt is served by the documented-matrix fast path.
+
+    Calls ``_router_multi_model_doc_table_fast`` directly and checks that the
+    answer carries the documented/not-documented framing without adapter-pigtail
+    or lifecycle-note text.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
+    out = core._router_multi_model_doc_table_fast(
+        "Compare MG51 vs MG52 vs MG52E and show only meaningful differences, including what is documented vs not documented."
+    )
+    assert out is not None
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "Documented vs not-documented comparison" in assistant
+    assert "Not documented" in assistant
+    assert "Adapter pigtails" not in assistant
+    assert "Lifecycle note:" not in assistant
 def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
     core = build_core_with(router_core=RepoCsvRouterCore())
     out = core.handle_message(
     assert "| MAXBR1PRO |" not in assistant
+def test_unified_kb_router_decision_table_keeps_rf_guidance_conservative() -> None:
+    """A police-vehicle comparison table for XR60, R980 and MAX BR1 Pro 5G keeps RF wording conservative.
+
+    Expects the ``router_vehicle_5g_recommendation_fast`` retrieval mode, the
+    plain connector counts ("4x SMA cellular", "2x RP-SMA Wi-Fi"), and neither
+    "4x4 MIMO on SMA" nor "Adapter pigtails" in the answer.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore())
+    out = core.handle_message(
+        "Build a comparison table for XR60, R980, and MAX BR1 Pro 5G for police vehicles, including RF connectors, vehicle fit, and conservative antenna-family guidance.",
+        {},
+        mode="auto",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "4x4 MIMO on SMA" not in assistant
+    assert "Adapter pigtails" not in assistant
+    assert "4x SMA cellular" in assistant
+    assert "2x RP-SMA Wi-Fi" in assistant
+def test_unified_kb_aer_docs_only_eval_prompt_stays_variant_conservative() -> None:
+    """The AER1600 vs AER2200 docs-only prompt yields the documented matrix with SKU-level placeholders.
+
+    Calls ``_router_multi_model_doc_table_fast`` directly; connector and modem
+    rows must ask for the exact SKU/package rather than name a Wi-Fi connector type.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
+    out = core._router_multi_model_doc_table_fast(
+        "Compare AER1600 vs AER2200 from docs only and separate clearly documented from not documented."
+    )
+    assert out is not None
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "Needs exact SKU/package; connector path varies across documented family variants." in assistant
+    assert "Needs exact SKU/package; modem path varies across documented family variants." in assistant
+    assert "external reverse-SMA Wi-Fi connectors" not in assistant
+def test_unified_kb_vehicle_eval_prompt_avoids_speculative_rf_language() -> None:
+    """The police-vehicle decision-table prompt must not emit speculative RF claims.
+
+    The answer has to avoid "4x4 MIMO on SMA" and "Adapter pigtails" and contain
+    at least one of the accepted antenna-family phrasings checked in the last assert.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore())
+    out = core.handle_message(
+        "Build a decision table comparing XR60, R980, and MAX BR1 Pro 5G for police vehicles with recommended antenna families.",
+        {},
+        mode="auto",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "4x4 MIMO on SMA" not in assistant
+    assert "Adapter pigtails" not in assistant
+    assert ("Needs connector validation" in assistant) or ("Husky" in assistant) or ("Listed, but family not explicit" in assistant)
 def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
     dec_path = tmp_path / "feb2026routers.csv"
     dec_path.write_text(
     assert "Lifecycle note:" not in assistant
+def test_unified_kb_br1_docs_only_eval_prompt_avoids_sparse_alias_rows() -> None:
+    """The MAX BR1 Pro 5G vs Mini 5G documented-specs table cites feb2026routers.csv for both models.
+
+    The answer must not reference router_pricing_catalog_normalized.csv and must
+    not mention adapter pigtails.
+    """
+    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
+    out = core._router_multi_model_doc_table_fast(
+        "Compare MAX BR1 Pro 5G vs MAX BR1 Mini 5G from documented specs only, in table format."
+    )
+    assert out is not None
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_table_fast"
+    assistant = str(out.get("assistant") or "")
+    assert "| Internal documented source | feb2026routers.csv | feb2026routers.csv |" in assistant
+    assert "router_pricing_catalog_normalized.csv" not in assistant
+    assert "Adapter pigtails" not in assistant
 def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
     core = build_core()
     out = core.handle_message(

backend/app/test_unified_kb_router_workbook.py CHANGED Viewed

@@ -94,8 +94,12 @@ def test_unified_kb_router_docs_spec_table_defers_to_documented_sources(tmp_path
     )
     assert out["meta"]["domain"] == "router_docs"
     assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
     assert out["meta"].get("router_intelligence_source") != "workbook"
     assert router_core.calls == 0
@@ -163,6 +167,66 @@ def test_unified_kb_router_docs_details_handles_typo_with_workbook(tmp_path: Pat
     assert router_core.calls == 0
 def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
     eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
     router_rag = StubRouterRag()
@@ -525,6 +589,33 @@ def test_unified_kb_router_lifecycle_fleet_snapshot_prefers_workbook(tmp_path: P
     assert router_core.calls == 0
 def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
     workbook_core = _loaded_workbook_core(tmp_path)
     core = build_core_with(

     )
     assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"
     assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
     assert out["meta"].get("router_intelligence_source") != "workbook"
+    assert "Documented multi-model compare table (internal docs only):" in out["assistant"]
+    assert "| Model | WAN/LAN ports | RF connectors | Modem/cellular | Install caveats | Evidence |" in out["assistant"]
+    assert "I need stronger internal citations" not in out["assistant"]
     assert router_core.calls == 0
     assert router_core.calls == 0
+def test_unified_kb_router_docs_mixed_lifecycle_details_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
+    """A details request that mixes lifecycle and spec questions is answered from the workbook.
+
+    With a loaded workbook provider, the response must use the
+    ``deterministic_router_workbook_details`` mode, carry no ``reason`` in its
+    metadata, and leave both the RAG stub and the CSV router core uncalled.
+    """
+    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
+    router_rag = StubRouterRag()
+    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
+    workbook_core = _loaded_workbook_core(tmp_path)
+    core = build_core_with(
+        router_rag_core=router_rag,
+        router_core=router_core,
+        masters_core=StubMastersCore(),
+        pots_core=StubPotsCore(),
+        rapid_router_intelligence_provider=lambda: workbook_core,
+    )
+    out = core.handle_message(
+        "Give me the details on Current-500, including lifecycle status, primary use case, WAN/LAN, Wi-Fi, and whether it is still a current recommendation.",
+        {},
+        mode="auto",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_details"
+    assert out["meta"]["router_intelligence_source"] == "workbook"
+    assert out["meta"].get("reason") is None
+    assert "Current-500" in str(out.get("assistant") or "")
+    assert router_rag.calls == 0
+    assert router_core.calls == 0
+def test_unified_kb_router_docs_mixed_lifecycle_compare_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
+    """A two-model compare that mixes lifecycle and spec questions is answered from the workbook.
+
+    Expects the ``deterministic_router_workbook_compare`` mode, no ``reason`` in
+    the metadata, the workbook comparison heading in the answer, and zero calls
+    to the RAG stub and the CSV router core.
+    """
+    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
+    router_rag = StubRouterRag()
+    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
+    workbook_core = _loaded_workbook_core(tmp_path)
+    core = build_core_with(
+        router_rag_core=router_rag,
+        router_core=router_core,
+        masters_core=StubMastersCore(),
+        pots_core=StubPotsCore(),
+        rapid_router_intelligence_provider=lambda: workbook_core,
+    )
+    out = core.handle_message(
+        "Compare Legacy-100 and Current-500, including lifecycle status, WAN/LAN, Wi-Fi, and whether either is still a current recommendation.",
+        {},
+        mode="auto",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["domain"] == "router_docs"
+    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_compare"
+    assert out["meta"]["router_intelligence_source"] == "workbook"
+    assert out["meta"].get("reason") is None
+    assert "Workbook-backed router comparison" in str(out.get("assistant") or "")
+    assert router_rag.calls == 0
+    assert router_core.calls == 0
 def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
     eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
     router_rag = StubRouterRag()
     assert router_core.calls == 0
+def test_unified_kb_router_lifecycle_fleet_snapshot_keeps_requested_model_labels(tmp_path: Path) -> None:
+    """A fleet-lifecycle table must contain the exact row prefixes asserted below, with the given quantities.
+
+    The answer must not contain the "Test Corp legacy 4G router" text, and the
+    CSV router core must not be called.
+    """
+    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
+    workbook_core = _loaded_workbook_core(tmp_path)
+    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
+    core = build_core_with(
+        router_rag_core=StubRouterRag(),
+        router_core=router_core,
+        masters_core=StubMastersCore(),
+        pots_core=StubPotsCore(),
+        rapid_router_intelligence_provider=lambda: workbook_core,
+    )
+    out = core.handle_message(
+        "Customer portfolio: 12 Legacy-100, 3 Legacy-NR. Build phased 5G replacement strategy with table.",
+        {},
+        mode="router_lifecycle",
+        audience="auto",
+        show_citations=True,
+    )
+    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_fleet_lifecycle"
+    assert "| Unknown | prod_test_legacy | 12 |" in out["assistant"]
+    assert "| Unknown | prod_test_no_replacement | 3 |" in out["assistant"]
+    assert "Test Corp legacy 4G router" not in out["assistant"]
+    assert router_core.calls == 0
 def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
     workbook_core = _loaded_workbook_core(tmp_path)
     core = build_core_with(

backend/app/test_validate_hosted_runtime.py CHANGED Viewed

@@ -15,13 +15,14 @@ def _load_validate_hosted_runtime_module():
     return module
-def _args(*, expect_auth_required: bool = True):
     return argparse.Namespace(
         base_url="https://example.hf.space",
         expected_build_version="release-123",
         expected_git_sha="abc123",
         expect_auth_required=expect_auth_required,
         expect_auth_enabled=True,
         timeout_s=20.0,
         out="",
     )
@@ -67,6 +68,128 @@ def test_build_report_accepts_protected_health_when_auth_required(monkeypatch) -
     assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
 def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
     module = _load_validate_hosted_runtime_module()

     return module
+def _args(*, expect_auth_required: bool = True, require_router_workbook_loaded: bool = False):
+    """Build the argparse namespace handed to ``_build_report`` in these tests; only the two keyword flags vary."""
     return argparse.Namespace(
         base_url="https://example.hf.space",
         expected_build_version="release-123",
         expected_git_sha="abc123",
         expect_auth_required=expect_auth_required,
         expect_auth_enabled=True,
+        require_router_workbook_loaded=require_router_workbook_loaded,
         timeout_s=20.0,
         out="",
     )
     assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
+def test_build_report_requires_router_workbook_loaded(monkeypatch) -> None:
+    """With the workbook requirement on, a loaded catalog produces a passing report.
+
+    ``_fetch_json`` is replaced by a fake that serves a matching /build-info,
+    a 401 on /api/health, and a loaded catalog with 42 products.
+    """
+    module = _load_validate_hosted_runtime_module()
+    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
+        if path == "/build-info":
+            return {
+                "build_version": "release-123",
+                "git_sha": "abc123",
+                "startup_integrity_ok": True,
+                "auth_required": True,
+                "auth_enabled": True,
+                "auth_config_error": "",
+                "auth_config_details": [],
+                "auth_config_warnings": [],
+                "app_base_url": "https://example.hf.space",
+                "vite_app_base_url": "https://example.hf.space",
+            }
+        if path == "/api/health":
+            raise HTTPError(
+                url=f"{base_url.rstrip('/')}/api/health",
+                code=401,
+                msg="Unauthorized",
+                hdrs=None,
+                fp=None,
+            )
+        if path == "/api/rapid_router/catalog/status":
+            return {"ok": True, "catalog": {"loaded": True, "product_count": 42}}
+        raise AssertionError(f"Unexpected path: {path}")
+    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)
+    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))
+    assert report["ok"] is True
+    assert report["checks"]["router_catalog_loaded"] is True
+    assert report["checks"]["router_catalog_product_count"] == 42
+    assert report["checks"]["router_catalog_access"] == "ok"
+def test_build_report_rejects_unloaded_router_workbook(monkeypatch) -> None:
+    """With the workbook requirement on, a catalog reporting ``loaded: False`` fails the report.
+
+    The failure list must mention that the router workbook catalog is not loaded.
+    """
+    module = _load_validate_hosted_runtime_module()
+    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
+        if path == "/build-info":
+            return {
+                "build_version": "release-123",
+                "git_sha": "abc123",
+                "startup_integrity_ok": True,
+                "auth_required": True,
+                "auth_enabled": True,
+                "auth_config_error": "",
+                "auth_config_details": [],
+                "auth_config_warnings": [],
+                "app_base_url": "https://example.hf.space",
+                "vite_app_base_url": "https://example.hf.space",
+            }
+        if path == "/api/health":
+            raise HTTPError(
+                url=f"{base_url.rstrip('/')}/api/health",
+                code=401,
+                msg="Unauthorized",
+                hdrs=None,
+                fp=None,
+            )
+        if path == "/api/rapid_router/catalog/status":
+            return {"ok": True, "catalog": {"loaded": False, "product_count": 0}}
+        raise AssertionError(f"Unexpected path: {path}")
+    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)
+    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))
+    assert report["ok"] is False
+    assert any("router workbook catalog is not loaded" in item.lower() for item in report["failures"])
+    assert report["checks"]["router_catalog_loaded"] is False
+def test_build_report_rejects_unreadable_router_workbook_status(monkeypatch) -> None:
+    """With the workbook requirement on, an HTTP 403 from the catalog status endpoint fails the report.
+
+    The checks must record the access as "protected" together with the 403 status code.
+    """
+    module = _load_validate_hosted_runtime_module()
+    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
+        if path == "/build-info":
+            return {
+                "build_version": "release-123",
+                "git_sha": "abc123",
+                "startup_integrity_ok": True,
+                "auth_required": True,
+                "auth_enabled": True,
+                "auth_config_error": "",
+                "auth_config_details": [],
+                "auth_config_warnings": [],
+                "app_base_url": "https://example.hf.space",
+                "vite_app_base_url": "https://example.hf.space",
+            }
+        if path == "/api/health":
+            raise HTTPError(
+                url=f"{base_url.rstrip('/')}/api/health",
+                code=401,
+                msg="Unauthorized",
+                hdrs=None,
+                fp=None,
+            )
+        if path == "/api/rapid_router/catalog/status":
+            raise HTTPError(
+                url=f"{base_url.rstrip('/')}/api/rapid_router/catalog/status",
+                code=403,
+                msg="Forbidden",
+                hdrs=None,
+                fp=None,
+            )
+        raise AssertionError(f"Unexpected path: {path}")
+    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)
+    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))
+    assert report["ok"] is False
+    assert any("/api/rapid_router/catalog/status could not be validated" in item for item in report["failures"])
+    assert report["checks"]["router_catalog_access"] == "protected"
+    assert report["checks"]["router_catalog_status_code"] == 403
 def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
     module = _load_validate_hosted_runtime_module()

backend/scripts/run_router_canary_ab_eval_shard.py ADDED Viewed

	@@ -0,0 +1,657 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import csv
+import importlib.util
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from copy import deepcopy
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+from urllib.error import HTTPError, URLError
+import certifi  # type: ignore
+import requests
+from playwright.sync_api import BrowserContext, Page, TimeoutError as PlaywrightTimeoutError, sync_playwright
+REPO_ROOT = Path(__file__).resolve().parents[2]
+FRONTEND_E2E_ENV = REPO_ROOT / "frontend" / ".env.e2e"
+VALIDATOR_SCRIPT = REPO_ROOT / "backend" / "scripts" / "validate_hosted_runtime.py"
+SCORER_SCRIPT = REPO_ROOT / "backend" / "scripts" / "score_router_canary_ab_responses.py"
+SURVEY_SCOPE = "knowledgebase"
+DEFAULT_TIMEOUT_S = 30.0
+def _load_python_module(path: Path, module_name: str) -> Any:
+    spec = importlib.util.spec_from_file_location(module_name, path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Could not load module from {path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+def _validator_module() -> Any:
+    return _load_python_module(VALIDATOR_SCRIPT, "validate_hosted_runtime")
+def _survey_fixture_bytes() -> bytes:
+    backend_dir = REPO_ROOT / "backend"
+    if str(backend_dir) not in sys.path:
+        sys.path.insert(0, str(backend_dir))
+    from app.rapid_router.test_catalog_db import _survey_workbook_bytes  # type: ignore
+    return _survey_workbook_bytes()
+def _read_env_file(path: Path) -> Dict[str, str]:
+    data: Dict[str, str] = {}
+    if not path.exists():
+        return data
+    for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
+        line = raw_line.strip()
+        if (not line) or line.startswith("#") or ("=" not in line):
+            continue
+        key, value = line.split("=", 1)
+        data[key.strip()] = value.strip().strip("'").strip('"')
+    return data
+def _auth_settings(env_file: Path) -> Dict[str, str]:
+    env_map = _read_env_file(env_file)
+    auth_domain = str(os.getenv("E2E_AUTH0_DOMAIN") or env_map.get("E2E_AUTH0_DOMAIN") or "").strip().lower()
+    auth_email = str(os.getenv("E2E_AUTH_TEST_EMAIL") or env_map.get("E2E_AUTH_TEST_EMAIL") or "").strip()
+    auth_password = str(os.getenv("E2E_AUTH_TEST_PASSWORD") or env_map.get("E2E_AUTH_TEST_PASSWORD") or "").strip()
+    if not (auth_domain and auth_email and auth_password):
+        raise RuntimeError(f"Hosted auth credentials are not fully configured in {env_file}.")
+    return {
+        "auth_domain": auth_domain,
+        "auth_email": auth_email,
+        "auth_password": auth_password,
+    }
+def _load_rows(path: Path) -> Tuple[List[Dict[str, str]], List[str]]:
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        reader = csv.DictReader(handle)
+        headers = list(reader.fieldnames or [])
+        rows = [{str(key): str(value or "") for key, value in row.items()} for row in reader]
+    return rows, headers
+def _write_rows(path: Path, rows: Sequence[Dict[str, str]], headers: Sequence[str]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=list(headers))
+        writer.writeheader()
+        for row in rows:
+            writer.writerow({header: str(row.get(header, "")) for header in headers})
+def _safe_json_load(raw: str, default: Any) -> Any:
+    text = str(raw or "").strip()
+    if not text:
+        return deepcopy(default)
+    try:
+        parsed = json.loads(text)
+    except Exception:
+        return deepcopy(default)
+    return parsed
+def _json_field(value: Any) -> str:
+    if value in ("", None):
+        return ""
+    return json.dumps(value, ensure_ascii=False)
+def _current_git_sha() -> str:
+    result = subprocess.run(
+        ["git", "rev-parse", "HEAD"],
+        cwd=REPO_ROOT,
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    return str(result.stdout or "").strip()
+def _run_hosted_preflight(*, base_url: str, expected_git_sha: str, expected_build_version: str, timeout_s: float) -> Dict[str, Any]:
+    validate = _validator_module()
+    args = argparse.Namespace(
+        base_url=base_url,
+        expected_build_version=expected_build_version,
+        expected_git_sha=expected_git_sha,
+        expect_auth_required=True,
+        expect_auth_enabled=True,
+        require_router_workbook_loaded=True,
+        timeout_s=timeout_s,
+        out="",
+    )
+    try:
+        return validate._build_report(args)
+    except (HTTPError, URLError, TimeoutError, json.JSONDecodeError) as exc:
+        return {
+            "ok": False,
+            "failures": [f"Hosted validation request failed: {type(exc).__name__}: {exc}"],
+            "warnings": [],
+            "checks": {"base_url": base_url.rstrip("/")},
+        }
+def _safe_host(url_value: str) -> str:
+    from urllib.parse import urlparse
+    try:
+        return str(urlparse(url_value).hostname or "").lower()
+    except Exception:
+        return ""
+def _visible(page: Page, selector: str) -> bool:
+    try:
+        return bool(page.locator(selector).first.is_visible(timeout=4_000))
+    except Exception:
+        return False
+def _is_app_shell_visible(page: Page) -> bool:
+    checks = [
+        page.get_by_title("Open account menu").first,
+        page.get_by_role("tab", name=re.compile("Knowledgebase", re.IGNORECASE)).first,
+        page.get_by_label(re.compile(r"Message the .*assistant", re.IGNORECASE)).first,
+    ]
+    for locator in checks:
+        try:
+            if locator.is_visible(timeout=300):
+                return True
+        except Exception:
+            continue
+    return False
+def _wait_for_gate_or_app(page: Page, timeout_ms: int = 90_000) -> str:
+    deadline = time.time() + (timeout_ms / 1000.0)
+    while time.time() < deadline:
+        if _is_app_shell_visible(page):
+            return "app"
+        try:
+            if page.get_by_role("heading", name=re.compile("Sign in required", re.IGNORECASE)).is_visible(timeout=300):
+                return "gate"
+        except Exception:
+            pass
+        try:
+            body_text = str(page.text_content("body", timeout=300) or "").lower()
+        except Exception:
+            body_text = ""
+        if "preparing space" in body_text:
+            page.wait_for_timeout(750)
+            continue
+        page.wait_for_timeout(200)
+    return "timeout"
+def _wait_for_auth_transition(page: Page, *, app_host: str, auth_domain: str, timeout_ms: int = 30_000) -> str:
+    deadline = time.time() + (timeout_ms / 1000.0)
+    while time.time() < deadline:
+        host = _safe_host(page.url)
+        if auth_domain and (auth_domain in host):
+            return "auth0"
+        try:
+            if page.get_by_role("heading", name=re.compile("Authentication configuration error", re.IGNORECASE)).is_visible(timeout=250):
+                return "error"
+        except Exception:
+            pass
+        if host == app_host and _is_app_shell_visible(page):
+            return "app"
+        page.wait_for_timeout(200)
+    return "timeout"
+def _complete_auth0_login(page: Page, *, email: str, password: str) -> None:
+    username_selector = 'input[name="username"], input[name="email"], input[type="email"]'
+    password_selector = 'input[name="password"], input[type="password"]'
+    submit_selector = 'button[type="submit"], button[name="action"]'
+    if not _visible(page, username_selector):
+        return
+    page.locator(username_selector).first.fill(email)
+    page.locator(password_selector).first.fill(password)
+    page.locator(submit_selector).first.click()
+    continue_btn = page.get_by_role("button", name=re.compile("continue|accept|allow", re.IGNORECASE)).first
+    try:
+        if continue_btn.is_visible(timeout=2_500):
+            continue_btn.click()
+    except Exception:
+        pass
+def _ensure_logged_in(page: Page, context: BrowserContext, *, base_url: str, auth_domain: str, email: str, password: str) -> None:
+    """Drive the browser from a cleared session to the authenticated app shell.
+
+    Cookies and local/session storage are cleared first; the sign-in gate is
+    then worked through at most twice.
+
+    Raises:
+        RuntimeError: if neither the gate nor the app renders in time, the
+            Log in click leads to an error screen or to no transition, the
+            browser does not end up on the app host, or the app shell is not
+            visible at the end.
+    """
+    app_host = _safe_host(base_url)
+    context.clear_cookies()
+    # Load the app, wipe its local/session storage, then load it again.
+    page.goto(base_url, wait_until="domcontentloaded")
+    page.evaluate("() => { window.localStorage.clear(); window.sessionStorage.clear(); }")
+    page.goto(base_url, wait_until="domcontentloaded")
+    for _attempt in range(2):
+        state = _wait_for_gate_or_app(page)
+        if state == "app":
+            break
+        if state == "timeout":
+            raise RuntimeError("App did not render auth gate or authenticated shell in time.")
+        # state == "gate": start the login flow.
+        page.get_by_role("button", name=re.compile("Log in", re.IGNORECASE)).click()
+        transition = _wait_for_auth_transition(page, app_host=app_host, auth_domain=auth_domain)
+        if transition == "error":
+            raise RuntimeError("Auth callback error screen shown after clicking Log in.")
+        if transition == "timeout":
+            raise RuntimeError("Log in click did not start an auth transition.")
+        if auth_domain and (auth_domain in _safe_host(page.url)):
+            _complete_auth0_login(page, email=email, password=password)
+        # Wait up to 60 seconds for the redirect back to the app host.
+        deadline = time.time() + 60.0
+        while time.time() < deadline:
+            if _safe_host(page.url) == app_host:
+                break
+            page.wait_for_timeout(200)
+    if _safe_host(page.url) != app_host:
+        raise RuntimeError("Did not return to the hosted app after Auth0 login.")
+    final_state = _wait_for_gate_or_app(page)
+    if final_state == "gate":
+        raise RuntimeError("Still on Sign in required screen after auth flow completed.")
+    if final_state != "app":
+        raise RuntimeError("Authenticated app shell did not become visible.")
+def _read_auth_token(page: Page) -> str:
+    token = page.evaluate(
+        """() => {
+            for (let i = 0; i < window.localStorage.length; i += 1) {
+              const key = window.localStorage.key(i);
+              if (!key || !key.includes('@@auth0spajs@@')) continue;
+              try {
+                const raw = window.localStorage.getItem(key);
+                if (!raw) continue;
+                const parsed = JSON.parse(raw);
+                const accessToken = String(parsed?.body?.access_token || '').trim();
+                if (accessToken) return accessToken;
+                const idToken = String(parsed?.id_token || '').trim();
+                if (idToken) return idToken;
+              } catch (err) {
+                // ignore malformed cache entries
+              }
+            }
+            return '';
+        }"""
+    )
+    token_text = str(token or "").strip()
+    if not token_text:
+        raise RuntimeError("Could not read an Auth0 token from browser storage.")
+    return token_text
+def _solve_captcha_prompt(prompt: str) -> str:
+    import re
+    match = re.search(r"(-?\d+)\s*([+\-*/xX])\s*(-?\d+)", str(prompt or ""))
+    if not match:
+        raise RuntimeError(f"Unsupported captcha prompt: {prompt}")
+    left = int(match.group(1))
+    op = match.group(2)
+    right = int(match.group(3))
+    if op == "+":
+        return str(left + right)
+    if op == "-":
+        return str(left - right)
+    if op in {"*", "x", "X"}:
+        return str(left * right)
+    if op == "/":
+        return str(left / right)
+    raise RuntimeError(f"Unsupported captcha operator: {op}")
+def _ensure_knowledgebase_captcha(session: requests.Session, *, base_url: str) -> str:
+    """Fetch, solve and verify a captcha challenge for ``SURVEY_SCOPE``.
+    Returns the verification token, or ``""`` when the challenge endpoint answers
+    404, when either endpoint reports ``enabled`` as falsy, or when the challenge
+    carries no prompt or challenge id.
+    Raises:
+        RuntimeError: If verification succeeds but no token is returned, or if the
+            prompt cannot be solved by ``_solve_captcha_prompt``.
+        requests.HTTPError: Via ``raise_for_status`` on non-404 error responses.
+    """
+    api_root = base_url.rstrip("/")
+    challenge_response = session.get(
+        f"{api_root}/api/captcha/challenge",
+        params={"scope": SURVEY_SCOPE},
+        timeout=DEFAULT_TIMEOUT_S,
+    )
+    if challenge_response.status_code == 404:
+        return ""
+    challenge_response.raise_for_status()
+    challenge_data = challenge_response.json()
+    # A missing "enabled" key counts as enabled.
+    if not challenge_data.get("enabled", True):
+        return ""
+    prompt = str(challenge_data.get("prompt") or "").strip()
+    challenge_id = str(challenge_data.get("challenge_id") or "").strip()
+    if not prompt or not challenge_id:
+        return ""
+    answer_body = {
+        "scope": SURVEY_SCOPE,
+        "challenge_id": challenge_id,
+        "answer": _solve_captcha_prompt(prompt),
+    }
+    verify_response = session.post(
+        f"{api_root}/api/captcha/verify",
+        json=answer_body,
+        timeout=DEFAULT_TIMEOUT_S,
+    )
+    verify_response.raise_for_status()
+    verify_data = verify_response.json()
+    if not verify_data.get("enabled", True):
+        return ""
+    token = str(verify_data.get("token") or "").strip()
+    if token:
+        return token
+    raise RuntimeError("Captcha verify succeeded but did not return a token.")
+def _requests_session(*, bearer_token: str, captcha_token: str) -> requests.Session:
+    """Build a ``requests`` session that verifies TLS against the certifi bundle.
+    The session always sends ``Authorization: Bearer <bearer_token>`` and adds
+    ``X-Captcha-Token`` only when ``captcha_token`` is non-empty.
+    """
+    default_headers = {"Authorization": f"Bearer {bearer_token}"}
+    if captcha_token:
+        default_headers["X-Captcha-Token"] = captcha_token
+    http = requests.Session()
+    http.verify = certifi.where()
+    http.headers.update(default_headers)
+    return http
+def _upload_seed_survey(session: requests.Session, *, base_url: str, seed_mode: str, survey_workbook_path: str) -> str:
+    """Upload a survey workbook and return the survey key reported by the API.
+    An explicit ``survey_workbook_path`` takes precedence over the synthetic
+    fixture, matching the ``--survey-workbook-path`` help text ("instead of the
+    synthetic fixture"). With no path, the fixture is used only when
+    ``seed_mode`` is ``"synthetic"``; otherwise nothing is uploaded.
+    Args:
+        session: Authenticated session used for the upload.
+        base_url: Hosted base URL; a trailing slash is tolerated.
+        seed_mode: Seeding mode from the CLI (``"none"`` or ``"synthetic"``).
+        survey_workbook_path: Optional path to an ``.xlsx`` workbook.
+    Returns:
+        The survey key, or ``""`` when there is nothing to upload.
+    Raises:
+        RuntimeError: If the upload succeeds but no survey key is returned.
+        requests.HTTPError: Via ``raise_for_status`` on an error response.
+    """
+    if survey_workbook_path:
+        path = Path(survey_workbook_path).expanduser().resolve()
+        workbook_bytes = path.read_bytes()
+        filename = path.name
+    elif seed_mode == "synthetic":
+        workbook_bytes = _survey_fixture_bytes()
+        filename = "router_canary_eval_synthetic_site_survey.xlsx"
+    else:
+        return ""
+    response = session.post(
+        f"{base_url.rstrip('/')}/api/rapid_router/surveys/upload",
+        files={
+            "file": (
+                filename,
+                workbook_bytes,
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            )
+        },
+        # Uploads get at least two minutes regardless of the default timeout.
+        timeout=max(DEFAULT_TIMEOUT_S, 120.0),
+    )
+    response.raise_for_status()
+    payload = response.json()
+    survey = payload.get("survey") if isinstance(payload, dict) else None
+    survey_key = str(survey.get("survey_key") or "").strip() if isinstance(survey, dict) else ""
+    if not survey_key:
+        raise RuntimeError("Survey upload succeeded but did not return a survey key.")
+    return survey_key
+def _collect_steps(row: Dict[str, str]) -> List[str]:
+    """Return the ordered user messages to send for one eval row.
+    Steps come from the ``conversation_steps_json`` column when it decodes to a
+    list: string entries are used as-is, dict entries contribute their
+    ``message`` (or, failing that, ``prompt``) value. Blank entries are dropped.
+    When no step survives, the row's ``prompt`` column is the single step; an
+    empty list means the row has nothing to run.
+    """
+    decoded = _safe_json_load(row.get("conversation_steps_json") or "", [])
+    collected: List[str] = []
+    for entry in decoded if isinstance(decoded, list) else []:
+        if isinstance(entry, str):
+            text = entry.strip()
+        elif isinstance(entry, dict):
+            text = str(entry.get("message") or entry.get("prompt") or "").strip()
+        else:
+            continue
+        if text:
+            collected.append(text)
+    if collected:
+        return collected
+    fallback = str(row.get("prompt") or "").strip()
+    if fallback:
+        return [fallback]
+    return []
+def _base_state_from_payload(row: Dict[str, str], *, survey_key: str) -> Dict[str, Any]:
+    """Build the initial conversation state for a row from its payload template.
+    The template's ``state`` dict is deep-copied (an empty dict is used when it
+    is missing or not a dict). For rows whose ``setup_kind`` is
+    ``active_survey_required`` and when ``survey_key`` is non-empty, the key is
+    stored as ``router_lifecycle_state.last_survey_key``.
+    """
+    template = _safe_json_load(row.get("api_payload_template_json") or "", {})
+    template_state = template.get("state")
+    seeded = deepcopy(template_state if isinstance(template_state, dict) else {})
+    wants_survey = str(row.get("setup_kind") or "") == "active_survey_required"
+    if wants_survey and survey_key:
+        lifecycle = seeded.get("router_lifecycle_state")
+        if not isinstance(lifecycle, dict):
+            lifecycle = {}
+        seeded["router_lifecycle_state"] = {**lifecycle, "last_survey_key": survey_key}
+    return seeded
+def _run_message_step(
+    session: requests.Session,
+    *,
+    base_url: str,
+    payload_template: Dict[str, Any],
+    message: str,
+    state: Dict[str, Any],
+    request_id: str,
+) -> Tuple[requests.Response, Dict[str, Any], float]:
+    """POST one conversation step to ``/api/knowledgebase/message``.
+    The request body is a deep copy of ``payload_template`` with ``message``,
+    ``state`` and ``request_id`` overwritten.
+    Returns:
+        ``(response, parsed, latency_ms)``: the raw response, its JSON body
+        (``{}`` when the body is not a JSON object), and the round-trip time
+        in milliseconds rounded to two decimals.
+    """
+    request_body = deepcopy(payload_template)
+    request_body.update({"message": message, "state": state, "request_id": request_id})
+    endpoint = f"{base_url.rstrip('/')}/api/knowledgebase/message"
+    clock_start = time.perf_counter()
+    response = session.post(
+        endpoint,
+        json=request_body,
+        timeout=max(DEFAULT_TIMEOUT_S, 120.0),
+    )
+    elapsed_ms = round((time.perf_counter() - clock_start) * 1000.0, 2)
+    try:
+        decoded = response.json()
+    except Exception:
+        # Any decoding problem is reported to the caller as an empty payload.
+        decoded = {}
+    if not isinstance(decoded, dict):
+        decoded = {}
+    return response, decoded, elapsed_ms
+def _row_result_template(row: Dict[str, str]) -> Dict[str, str]:
+    """Copy ``row`` and make sure every result column exists as a string.
+    Existing values are stringified; missing or falsy values become ``""``.
+    The input row is not modified.
+    """
+    result_columns = (
+        "run_status",
+        "http_status",
+        "request_id",
+        "latency_ms",
+        "response_assistant",
+        "response_sources_json",
+        "response_files_json",
+        "response_meta_json",
+        "response_state_json",
+        "response_error",
+    )
+    prepared = dict(row)
+    prepared.update({column: str(prepared.get(column, "") or "") for column in result_columns})
+    return prepared
+def _score_if_requested(*, in_csv: Path, should_score: bool) -> None:
+    """Run the scoring script on ``in_csv`` when ``should_score`` is true.
+    The scorer runs with ``REPO_ROOT`` as its working directory.
+    Raises:
+        subprocess.CalledProcessError: If the scorer exits with a non-zero status.
+    """
+    if not should_score:
+        return
+    import sys
+    # Use the interpreter that is running this script so the scorer sees the same
+    # environment and installed packages; "python3" on PATH may be a different one.
+    interpreter = sys.executable or "python3"
+    subprocess.run(
+        [interpreter, str(SCORER_SCRIPT), "--in-csv", str(in_csv)],
+        cwd=REPO_ROOT,
+        check=True,
+    )
+def run_shard(
+    *,
+    in_csv: Path,
+    out_csv: Path,
+    base_url: str,
+    env_file: Path,
+    expected_git_sha: str,
+    expected_build_version: str,
+    seed_survey: str,
+    survey_workbook_path: str,
+    timeout_s: float,
+    headed: bool,
+    score_after: bool,
+) -> Dict[str, Any]:
+    """Run every row of one eval shard against the hosted app and record results.
+    The hosted preflight must report ``ok`` first. A Playwright Chromium browser
+    is then used for ``_ensure_logged_in`` and to read a bearer token; the row
+    prompts themselves are sent through a ``requests`` session. Each row ends
+    with one of these ``run_status`` values: ``completed``, ``http_error``,
+    ``invalid_response`` (2xx reply whose body was not a non-empty JSON object),
+    ``invalid_row`` (no prompt or steps), or ``deferred_active_survey_required``.
+    Results are written to ``out_csv`` and optionally scored.
+    Args:
+        in_csv: Shard CSV to execute.
+        out_csv: Where the result rows are written (may equal ``in_csv``).
+        base_url: Hosted base URL.
+        env_file: Env file passed to ``_auth_settings``.
+        expected_git_sha: Git SHA passed to the hosted preflight.
+        expected_build_version: Build version passed to the hosted preflight.
+        seed_survey: Survey seeding mode (``"none"`` or ``"synthetic"``).
+        survey_workbook_path: Optional workbook to upload as the active survey.
+        timeout_s: Per-request timeout for the hosted preflight.
+        headed: Run the browser with a visible window.
+        score_after: Run the scoring script on ``out_csv`` afterwards.
+    Returns:
+        Summary dict with row counts, the seeded survey key, the preflight
+        report and the output path.
+    Raises:
+        RuntimeError: If the hosted preflight does not report ``ok``.
+    """
+    rows, headers = _load_rows(in_csv)
+    report = _run_hosted_preflight(
+        base_url=base_url,
+        expected_git_sha=expected_git_sha,
+        expected_build_version=expected_build_version,
+        timeout_s=timeout_s,
+    )
+    if not report.get("ok"):
+        failures = "\n".join(f"- {item}" for item in list(report.get("failures") or []))
+        raise RuntimeError(f"Hosted preflight failed before shard execution:\n{failures}")
+    auth = _auth_settings(env_file)
+    survey_rows_needed = any(str(row.get("setup_kind") or "") == "active_survey_required" for row in rows)
+    attempted = 0
+    completed = 0
+    deferred = 0
+    seeded_survey_key = ""
+    processed: List[Dict[str, str]] = []
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=(not headed))
+        # Close the browser even when login, seeding or a row request raises.
+        try:
+            context = browser.new_context(ignore_https_errors=False)
+            page = context.new_page()
+            _ensure_logged_in(
+                page,
+                context,
+                base_url=base_url,
+                auth_domain=auth["auth_domain"],
+                email=auth["auth_email"],
+                password=auth["auth_password"],
+            )
+            bearer_token = _read_auth_token(page)
+            session = _requests_session(bearer_token=bearer_token, captcha_token="")
+            # Captcha is best-effort: on any failure the run continues without a token.
+            try:
+                captcha_token = _ensure_knowledgebase_captcha(session, base_url=base_url)
+            except Exception:
+                captcha_token = ""
+            if captcha_token:
+                session.headers.update({"X-Captcha-Token": captcha_token})
+            if survey_rows_needed and (seed_survey != "none" or survey_workbook_path):
+                seeded_survey_key = _upload_seed_survey(
+                    session,
+                    base_url=base_url,
+                    seed_mode=seed_survey,
+                    survey_workbook_path=survey_workbook_path,
+                )
+            for row in rows:
+                current = _row_result_template(row)
+                setup_kind = str(row.get("setup_kind") or "").strip()
+                payload_template = _safe_json_load(row.get("api_payload_template_json") or "", {})
+                steps = _collect_steps(row)
+                if not steps:
+                    current["run_status"] = "invalid_row"
+                    current["response_error"] = "No executable prompt or conversation steps were present for this row."
+                    processed.append(current)
+                    continue
+                if setup_kind == "active_survey_required" and not seeded_survey_key:
+                    current["run_status"] = "deferred_active_survey_required"
+                    current["response_error"] = "Active survey context not available for this shard execution."
+                    deferred += 1
+                    processed.append(current)
+                    continue
+                attempted += 1
+                base_state = _base_state_from_payload(row, survey_key=seeded_survey_key)
+                current_state = deepcopy(base_state)
+                final_payload: Dict[str, Any] = {}
+                final_response: Optional[requests.Response] = None
+                last_latency_ms = 0.0
+                run_status = "completed"
+                response_error = ""
+                for step_index, step_message in enumerate(steps, start=1):
+                    request_id = str(payload_template.get("request_id") or row.get("case_id") or "router-canary-eval").strip()
+                    if len(steps) > 1:
+                        request_id = f"{request_id}-step-{step_index}"
+                    response, parsed, latency_ms = _run_message_step(
+                        session,
+                        base_url=base_url,
+                        payload_template=payload_template,
+                        message=step_message,
+                        state=current_state,
+                        request_id=request_id,
+                    )
+                    final_response = response
+                    final_payload = parsed
+                    last_latency_ms = latency_ms
+                    if response.status_code >= 400:
+                        run_status = "http_error"
+                        response_error = str(parsed.get("detail") or parsed.get("error") or response.text[:500]).strip()
+                        break
+                    # Carry the server-returned state into the next step of the conversation.
+                    if isinstance(parsed.get("state"), dict):
+                        current_state = parsed.get("state") or current_state
+                if run_status == "completed" and not final_payload:
+                    # A 2xx reply without a usable JSON object is not a completed row;
+                    # flag it instead of recording an empty answer with no error.
+                    run_status = "invalid_response"
+                    response_error = "Response body was not a non-empty JSON object."
+                current["run_status"] = run_status
+                if final_response is not None:
+                    current["http_status"] = str(final_response.status_code)
+                    current["request_id"] = str(final_response.headers.get("x-request-id") or payload_template.get("request_id") or row.get("case_id") or "")
+                    current["latency_ms"] = str(last_latency_ms)
+                if run_status == "completed":
+                    current["response_assistant"] = str(final_payload.get("assistant") or "")
+                    current["response_sources_json"] = _json_field(final_payload.get("sources") or [])
+                    current["response_files_json"] = _json_field(final_payload.get("files") or [])
+                    current["response_meta_json"] = _json_field(final_payload.get("meta") or {})
+                    current["response_state_json"] = _json_field(final_payload.get("state") or current_state)
+                    current["response_error"] = ""
+                    completed += 1
+                else:
+                    current["response_error"] = response_error
+                    if final_payload.get("state"):
+                        current["response_state_json"] = _json_field(final_payload.get("state"))
+                processed.append(current)
+        finally:
+            browser.close()
+    _write_rows(out_csv, processed, headers)
+    _score_if_requested(in_csv=out_csv, should_score=score_after)
+    return {
+        "rows_total": len(rows),
+        "rows_attempted": attempted,
+        "rows_completed": completed,
+        "rows_deferred": deferred,
+        "seeded_survey_key": seeded_survey_key,
+        "preflight": report,
+        "out_csv": str(out_csv),
+    }
+def main() -> int:
+    """Parse the CLI arguments, run one shard, and print the JSON summary.
+    The output CSV defaults to the input CSV (in-place update), and the expected
+    git SHA defaults to the value of ``_current_git_sha()``. Returns ``0``.
+    """
+    cli = argparse.ArgumentParser(description="Run one router canary A/B eval shard against the hosted canary.")
+    cli.add_argument("--in-csv", required=True, help="Input shard CSV path.")
+    cli.add_argument("--out-csv", default="", help="Optional output CSV path. Defaults to updating the input CSV in place.")
+    cli.add_argument("--base-url", required=True, help="Hosted base URL, for example https://owner-space.hf.space")
+    cli.add_argument("--auth-env-file", default=str(FRONTEND_E2E_ENV), help="Path to the hosted E2E env file.")
+    cli.add_argument("--expected-git-sha", default="", help="Expected hosted git SHA. Defaults to local HEAD.")
+    cli.add_argument("--expected-build-version", default="", help="Expected hosted build version, if known.")
+    cli.add_argument("--seed-survey", choices=("none", "synthetic"), default="none", help="How to seed active survey rows.")
+    cli.add_argument("--survey-workbook-path", default="", help="Optional .xlsx survey workbook to upload instead of the synthetic fixture.")
+    cli.add_argument("--timeout-s", type=float, default=30.0, help="Hosted preflight timeout per request.")
+    cli.add_argument("--headed", action="store_true", help="Run Playwright headed for debugging.")
+    cli.add_argument("--score-after", action="store_true", help="Run the OpenAI scoring script after writing the shard CSV.")
+    opts = cli.parse_args()
+    source_csv = Path(opts.in_csv).expanduser().resolve()
+    if opts.out_csv:
+        target_csv = Path(opts.out_csv).expanduser().resolve()
+    else:
+        target_csv = source_csv
+    pinned_sha = str(opts.expected_git_sha or "").strip()
+    if not pinned_sha:
+        pinned_sha = _current_git_sha()
+    summary = run_shard(
+        in_csv=source_csv,
+        out_csv=target_csv,
+        base_url=str(opts.base_url).strip(),
+        env_file=Path(opts.auth_env_file).expanduser().resolve(),
+        expected_git_sha=pinned_sha,
+        expected_build_version=str(opts.expected_build_version or "").strip(),
+        seed_survey=str(opts.seed_survey or "none").strip(),
+        survey_workbook_path=str(opts.survey_workbook_path or "").strip(),
+        timeout_s=float(opts.timeout_s or 30.0),
+        headed=bool(opts.headed),
+        score_after=bool(opts.score_after),
+    )
+    print(json.dumps(summary, indent=2))
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

backend/scripts/score_router_canary_ab_responses.py ADDED Viewed

	@@ -0,0 +1,363 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import csv
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+try:
+    from openai import OpenAI  # type: ignore
+except Exception:  # pragma: no cover
+    OpenAI = None  # type: ignore
+def _is_placeholder_key(value: str) -> bool:
+    """Return True when ``value`` is empty or a known placeholder API key.
+    Matching is case-insensitive and ignores surrounding whitespace.
+    """
+    normalized = str(value or "").strip().upper()
+    if normalized == "":
+        return True
+    if normalized.startswith("YOUR_KEY"):
+        return True
+    return normalized in ("<YOUR_OPENAI_API_KEY>", "YOUR_OPENAI_API_KEY", "REPLACE_ME")
+def _safe_float(value: Any) -> Optional[float]:
+    """Convert ``value`` to a finite float, or return None when that is not possible.
+    Strings are stripped and may carry a trailing ``%``. Blank strings,
+    unparseable values and non-finite results (NaN, infinity) all yield None.
+    """
+    import math
+    try:
+        if isinstance(value, str):
+            candidate = value.strip().rstrip("%").strip()
+            if not candidate:
+                return None
+            numeric = float(candidate)
+        else:
+            numeric = float(value)
+    except Exception:
+        # Broad on purpose: this helper must never raise, whatever the value is.
+        return None
+    # float("nan") and float("inf") parse fine but cannot be rounded to an int later.
+    return numeric if math.isfinite(numeric) else None
+def _safe_json_load(raw: str) -> Dict[str, Any]:
+    """Decode ``raw`` into a JSON object, returning ``{}`` on any failure.
+    The whole text is tried first. Only if that fails to parse is the span from
+    the first ``{`` to the last ``}`` tried instead. Anything that decodes to a
+    non-dict yields ``{}``.
+    """
+    text = str(raw or "").strip()
+    if not text:
+        return {}
+    try:
+        decoded = json.loads(text)
+    except Exception:
+        decoded = None
+        embedded = re.search(r"\{.*\}", text, flags=re.DOTALL)
+        if embedded is not None:
+            try:
+                decoded = json.loads(embedded.group(0))
+            except Exception:
+                decoded = None
+    if isinstance(decoded, dict):
+        return decoded
+    return {}
+def _grade_letter(score: float) -> str:
+    """Map a 0-100 score to a letter grade: A >= 90, B >= 80, C >= 70, D >= 60, else F."""
+    for floor, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
+        if score >= floor:
+            return letter
+    return "F"
+def _unwrap_semantic_json_payload(raw: str) -> Dict[str, Any]:
+    """Strip Markdown code fences from a model reply and decode the JSON inside.
+    Up to four passes are made. A pass that finds the text wrapped in a
+    triple-backtick fence (optionally tagged ``json``) unwraps it; the first pass
+    that finds no fence hands the text to ``_safe_json_load``. Returns ``{}`` for
+    blank input, undecodable text, or text still fenced after four passes.
+    """
+    text = str(raw or "").strip()
+    if not text:
+        return {}
+    fence_pattern = re.compile(r"^\s*```(?:json)?\s*(.*?)\s*```\s*$", flags=re.IGNORECASE | re.DOTALL)
+    passes_left = 4
+    while passes_left > 0:
+        passes_left -= 1
+        fence = fence_pattern.match(text)
+        if fence is None:
+            decoded = _safe_json_load(text)
+            return decoded if decoded else {}
+        text = str(fence.group(1) or "").strip()
+    return {}
+def _normalize_score(value: Any, fallback: Any = 0) -> int:
+    """Coerce a judge score onto an integer 0-100 scale.
+    ``value`` is parsed with ``_safe_float``; if that fails ``fallback`` is tried,
+    then 0. Values of at most 1 are treated as fractions (x100), values of at
+    most 10 as a ten-point scale (x10), and the result is rounded and clamped
+    to the range 0-100.
+    """
+    score = _safe_float(value)
+    if score is None:
+        fallback_score = _safe_float(fallback)
+        score = 0.0 if fallback_score is None else fallback_score
+    # The scale is guessed from the magnitude alone.
+    if score <= 1.0:
+        score = score * 100.0
+    elif score <= 10.0:
+        score = score * 10.0
+    bounded = min(100, round(score))
+    return int(max(0, bounded))
+def _normalize_grade_payload(parsed: Dict[str, Any]) -> Dict[str, Any]:
+    """Normalize a raw judge payload into bounded scores, issues, rationale and grade.
+    Each of the five dimension scores goes through ``_normalize_score``. The
+    overall score uses the judge's ``overall_score`` when it is numeric; otherwise
+    it is the rounded mean of the five normalized dimension scores.
+    Returns:
+        Dict with the six integer scores, up to ten issues as strings, the
+        rationale truncated to 600 characters, and the letter grade.
+    """
+    fact = _normalize_score(parsed.get("fact_score"))
+    instruction = _normalize_score(parsed.get("instruction_score"))
+    coverage = _normalize_score(parsed.get("coverage_score"))
+    readability = _normalize_score(parsed.get("readability_score"))
+    safety = _normalize_score(parsed.get("safety_score"))
+    if _safe_float(parsed.get("overall_score")) is None:
+        # The mean is already on the 0-100 scale. Sending it through _normalize_score
+        # again would rescale small means (for example 5 -> 50, 1 -> 100).
+        component_mean = round((fact + instruction + coverage + readability + safety) / 5)
+        overall = int(max(0, min(100, component_mean)))
+    else:
+        overall = _normalize_score(parsed.get("overall_score"))
+    issues = parsed.get("issues") if isinstance(parsed.get("issues"), list) else []
+    rationale = str(parsed.get("rationale") or "").strip()
+    return {
+        "fact_score": fact,
+        "instruction_score": instruction,
+        "coverage_score": coverage,
+        "readability_score": readability,
+        "safety_score": safety,
+        "overall_score": overall,
+        "issues": [str(item) for item in issues[:10]],
+        "rationale": rationale[:600],
+        "grade": _grade_letter(float(overall)),
+    }
+def _score_row(
+    row: Dict[str, str],
+    *,
+    client: Any,
+    model: str,
+    max_sources: int = 4,
+) -> Dict[str, Any]:
+    """Grade one captured answer with the judge model.
+    Rows with a blank ``response_assistant`` are returned as skipped without any
+    API call. Otherwise the answer (first 6500 characters), the row's eval
+    columns, the decoded ``response_meta_json`` and at most ``max_sources`` source
+    entries (excerpts cut to 260 characters) are sent to
+    ``client.responses.create``. The call is made at most twice, stopping at the
+    first reply that unwraps to a non-empty JSON object.
+    Returns:
+        The normalized grade payload with ``skipped`` set to False, or a dict
+        carrying only ``skipped``, ``issues`` and ``rationale`` when the row was
+        skipped or no reply could be parsed.
+    """
+    answer = str(row.get("response_assistant") or "").strip()
+    if answer == "":
+        return {
+            "skipped": True,
+            "issues": ["missing_response_assistant"],
+            "rationale": "No assistant answer was present in the CSV row.",
+        }
+    try:
+        decoded_sources = json.loads(str(row.get("response_sources_json") or "[]"))
+    except Exception:
+        decoded_sources = []
+    if not isinstance(decoded_sources, list):
+        decoded_sources = []
+    trimmed_sources: List[Dict[str, str]] = []
+    for entry in decoded_sources[:max_sources]:
+        if isinstance(entry, dict):
+            compact = {name: str(entry.get(name) or "") for name in ("id", "doc", "relative_path", "chunk_id")}
+            compact["excerpt"] = str(entry.get("excerpt") or "")[:260]
+            trimmed_sources.append(compact)
+    meta = _safe_json_load(str(row.get("response_meta_json") or ""))
+    system_msg = (
+        "You are a strict QA grader for an internal router-intelligence assistant. "
+        "Grade the answer on five dimensions: fact_score, instruction_score, coverage_score, readability_score, and safety_score. "
+        "The assistant must be conservative, current-only by default unless the prompt explicitly asks for legacy, "
+        "must not invent specs/lifecycle/pricing/policy, and should ask for clarification or abstain when setup or evidence is missing. "
+        "Return strict JSON with keys: fact_score, instruction_score, coverage_score, readability_score, safety_score, overall_score, issues, rationale. "
+        "Use numeric scores on a 0-100 scale where 100 is best."
+    )
+    judge_request = {
+        "case_id": row.get("case_id"),
+        "family": row.get("family"),
+        "subfamily": row.get("subfamily"),
+        "difficulty": row.get("difficulty"),
+        "mode_hint": row.get("mode_hint"),
+        "setup_kind": row.get("setup_kind"),
+        "run_readiness": row.get("run_readiness"),
+        "judge_focus": row.get("judge_focus"),
+        "prompt": row.get("prompt"),
+        "notes": row.get("notes"),
+        "assistant": answer[:6500],
+        "sources": trimmed_sources,
+        "meta": meta,
+    }
+    user_content = json.dumps(judge_request, ensure_ascii=False)
+    verdict: Dict[str, Any] = {}
+    attempts_left = 2
+    while attempts_left > 0 and not verdict:
+        attempts_left -= 1
+        response = client.responses.create(
+            model=model,
+            input=[
+                {"role": "system", "content": system_msg},
+                {"role": "user", "content": user_content},
+            ],
+            reasoning={"effort": "minimal"},
+            max_output_tokens=700,
+        )
+        verdict = _unwrap_semantic_json_payload(str(getattr(response, "output_text", "") or ""))
+    if not verdict:
+        return {
+            "skipped": False,
+            "issues": ["invalid_judge_payload"],
+            "rationale": "The OpenAI grader did not return a parseable JSON payload after one retry.",
+        }
+    graded = _normalize_grade_payload(verdict)
+    graded["skipped"] = False
+    return graded
+def _load_rows(path: Path) -> List[Dict[str, str]]:
+    """Read a UTF-8 CSV file into a list of dicts with string keys and values.
+    Empty or missing cells become ``""``.
+    """
+    loaded: List[Dict[str, str]] = []
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        for record in csv.DictReader(handle):
+            loaded.append({str(column): str(cell or "") for column, cell in record.items()})
+    return loaded
+def _write_rows(path: Path, rows: List[Dict[str, str]], headers: List[str]) -> None:
+    """Write ``rows`` to ``path`` as UTF-8 CSV restricted to ``headers``.
+    Parent directories are created as needed. Columns missing from a row are
+    written as ``""``; row keys not listed in ``headers`` are dropped.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=headers)
+        writer.writeheader()
+        writer.writerows({header: row.get(header, "") for header in headers} for row in rows)
+def score_csv(
+    *,
+    in_csv: Path,
+    out_csv: Path,
+    out_json: Path,
+    model: str,
+    pass_threshold: float = 80.0,
+    safety_threshold: float = 70.0,
+) -> Dict[str, Any]:
+    """Grade every row of ``in_csv`` and write a scored CSV plus a JSON summary.
+    Args:
+        in_csv: CSV of captured answers.
+        out_csv: Destination for the input rows plus the ``judge_*`` columns.
+        out_json: Destination for the summary payload.
+        model: Judge model name passed to ``_score_row``.
+        pass_threshold: Minimum overall score for ``judge_pass`` to be ``"true"``.
+        safety_threshold: Minimum safety score for ``judge_pass`` to be ``"true"``.
+    Returns:
+        The summary payload that was written to ``out_json``.
+    Raises:
+        RuntimeError: If at least one row has an answer to grade and either the
+            ``openai`` package is unavailable or ``OPENAI_API_KEY`` is missing
+            or a placeholder.
+    """
+    rows = _load_rows(in_csv)
+    # Only require the OpenAI client when at least one row has an answer to grade.
+    needs_scoring = any(str(row.get("response_assistant") or "").strip() for row in rows)
+    if needs_scoring:
+        if OpenAI is None:
+            raise RuntimeError("openai package is not available in this environment.")
+        key = str(os.getenv("OPENAI_API_KEY") or "").strip()
+        if _is_placeholder_key(key):
+            raise RuntimeError("OPENAI_API_KEY is missing or placeholder.")
+        client = OpenAI(api_key=key, timeout=60.0)  # type: ignore[operator]
+    else:
+        client = None
+    scored_rows: List[Dict[str, str]] = []
+    # Per-dimension score lists, keyed by field name such as "fact_score".
+    summary_buckets: Dict[str, List[float]] = defaultdict(list)
+    # Overall scores grouped by the row's "family" column.
+    family_scores: Dict[str, List[float]] = defaultdict(list)
+    scored_count = 0
+    skipped_count = 0
+    for row in rows:
+        current = dict(row)
+        if client is None:
+            # No client means no row had an answer, so every row is skipped.
+            result = {
+                "skipped": True,
+                "issues": ["missing_response_assistant"],
+                "rationale": "No assistant answer was present in the CSV row.",
+            }
+        else:
+            result = _score_row(current, client=client, model=model)
+        current["judge_model"] = model
+        current["judge_issues_json"] = json.dumps(result.get("issues") or [], ensure_ascii=False)
+        current["judge_rationale"] = str(result.get("rationale") or "")
+        if result.get("skipped"):
+            skipped_count += 1
+            current["judge_pass"] = ""
+            current["judge_grade"] = ""
+        else:
+            # NOTE(review): an "invalid_judge_payload" result is not marked skipped, so it
+            # lands here without any scores: it counts toward scored_count, gets overall
+            # 0.0 / grade F / judge_pass "false", and adds 0.0 to its family average while
+            # contributing nothing to summary_buckets. Confirm this is intended.
+            scored_count += 1
+            for field in ("fact_score", "instruction_score", "coverage_score", "readability_score", "safety_score", "overall_score"):
+                current[f"judge_{field}"] = str(result.get(field, ""))
+                score_value = _safe_float(result.get(field))
+                if score_value is not None:
+                    summary_buckets[field].append(score_value)
+            overall_score = float(result.get("overall_score") or 0.0)
+            safety_score = float(result.get("safety_score") or 0.0)
+            current["judge_grade"] = str(result.get("grade") or _grade_letter(overall_score))
+            # A row passes only when both the overall and the safety thresholds are met.
+            current["judge_pass"] = "true" if (overall_score >= pass_threshold and safety_score >= safety_threshold) else "false"
+            family_scores[str(current.get("family") or "unknown")].append(overall_score)
+        scored_rows.append(current)
+    # Keep the input column order and append any judge columns that are not there yet.
+    headers = list(rows[0].keys()) if rows else []
+    for field in (
+        "judge_model",
+        "judge_fact_score",
+        "judge_instruction_score",
+        "judge_coverage_score",
+        "judge_readability_score",
+        "judge_safety_score",
+        "judge_overall_score",
+        "judge_grade",
+        "judge_pass",
+        "judge_issues_json",
+        "judge_rationale",
+    ):
+        if field not in headers:
+            headers.append(field)
+    _write_rows(out_csv, scored_rows, headers)
+    average_scores = {
+        key: round(sum(values) / max(1, len(values)), 2)
+        for key, values in summary_buckets.items()
+        if values
+    }
+    family_averages = {
+        family: {
+            "avg_overall_score": round(sum(values) / max(1, len(values)), 2),
+            "count": len(values),
+        }
+        for family, values in sorted(family_scores.items())
+        if values
+    }
+    pass_count = sum(1 for row in scored_rows if str(row.get("judge_pass") or "").lower() == "true")
+    payload = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "input_csv": str(in_csv),
+        "output_csv": str(out_csv),
+        "model": model,
+        "row_count": len(rows),
+        "scored_count": scored_count,
+        "skipped_count": skipped_count,
+        "pass_threshold": pass_threshold,
+        "safety_threshold": safety_threshold,
+        "pass_count": pass_count,
+        # Percentage of scored (non-skipped) rows that passed; 0.0 when nothing was scored.
+        "pass_rate": round((pass_count / max(1, scored_count)) * 100.0, 2) if scored_count else 0.0,
+        "average_scores": average_scores,
+        "families": family_averages,
+    }
+    out_json.parent.mkdir(parents=True, exist_ok=True)
+    out_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+    return payload
+def main() -> int:
+    """Parse the CLI arguments, score the CSV, and print a one-line summary.
+    The scored CSV and summary JSON default to ``<stem>_scored.csv`` and
+    ``<stem>_scored_summary.json`` next to the input file. Returns ``0``.
+    """
+    parser = argparse.ArgumentParser(description="Score router canary A/B response CSVs with OpenAI.")
+    parser.add_argument("--in-csv", required=True, help="CSV containing captured answers.")
+    parser.add_argument("--out-csv", default="", help="Scored CSV output path. Defaults next to the input.")
+    parser.add_argument("--out-json", default="", help="Summary JSON output path. Defaults next to the input.")
+    parser.add_argument(
+        "--model",
+        default=os.getenv("ROUTER_CANARY_AB_EVAL_MODEL", os.getenv("UNIFIED_KB_EVAL_SEMANTIC_MODEL", os.getenv("OPENAI_MODEL", "gpt-5-mini"))),
+        help="OpenAI model used for grading.",
+    )
+    parser.add_argument("--pass-threshold", type=float, default=80.0)
+    parser.add_argument("--safety-threshold", type=float, default=70.0)
+    args = parser.parse_args()
+    in_csv = Path(args.in_csv).resolve()
+    out_csv = Path(args.out_csv).resolve() if args.out_csv else in_csv.with_name(f"{in_csv.stem}_scored.csv")
+    out_json = Path(args.out_json).resolve() if args.out_json else in_csv.with_name(f"{in_csv.stem}_scored_summary.json")
+    summary = score_csv(
+        in_csv=in_csv,
+        out_csv=out_csv,
+        out_json=out_json,
+        model=str(args.model or "gpt-5-mini"),
+        # argparse already yields floats with defaults. An "or <default>" fallback
+        # here would silently turn an explicit threshold of 0 into the default.
+        pass_threshold=float(args.pass_threshold),
+        safety_threshold=float(args.safety_threshold),
+    )
+    print(
+        f"Scored {summary['scored_count']} rows from {summary['input_csv']} "
+        f"with pass rate {summary['pass_rate']}% using {summary['model']}"
+    )
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

backend/scripts/validate_hosted_runtime.py CHANGED Viewed

@@ -97,6 +97,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
     health_ok: Optional[bool] = None
     health_status_code: Optional[int] = None
     health_access = "ok"
     try:
         health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
         health_ok = bool(health.get("ok", False))
@@ -111,6 +116,30 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
             raise
     auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
     build_version = str(build_info.get("build_version") or "").strip()
     git_sha = str(build_info.get("git_sha") or "").strip()
     startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
@@ -144,6 +173,8 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
         failures.append(
             f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
         )
     if audience in FORBIDDEN_AUDIENCE_VALUES:
         failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
     if expected_origin:
@@ -183,6 +214,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
             "health_ok": health_ok,
             "health_access": health_access,
             "health_status_code": health_status_code,
             "auth_required": auth_required,
             "auth_enabled": auth_enabled,
             "auth_audience": audience,
@@ -204,12 +240,18 @@ def main() -> int:
     parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
     parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
     parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
     parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
     parser.add_argument("--out", default="", help="Optional output JSON path.")
     args = parser.parse_args()
     args.expect_auth_required = _parse_bool(args.expect_auth_required)
     args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
     try:
         report = _build_report(args)

     health_ok: Optional[bool] = None
     health_status_code: Optional[int] = None
     health_access = "ok"
+    router_catalog_status: Dict[str, Any] = {}
+    router_catalog_loaded: Optional[bool] = None
+    router_catalog_product_count: Optional[int] = None
+    router_catalog_access = "not_checked"
+    router_catalog_status_code: Optional[int] = None
     try:
         health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
         health_ok = bool(health.get("ok", False))
             raise
     auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
+    if args.require_router_workbook_loaded:
+        router_catalog_access = "ok"
+        try:
+            router_catalog_status = _fetch_json(args.base_url, "/api/rapid_router/catalog/status", args.timeout_s)
+            catalog = router_catalog_status.get("catalog") if isinstance(router_catalog_status.get("catalog"), dict) else {}
+            router_catalog_loaded = bool(catalog.get("loaded", False))
+            try:
+                router_catalog_product_count = int(catalog.get("product_count")) if catalog.get("product_count") is not None else None
+            except Exception:
+                router_catalog_product_count = None
+        except HTTPError as exc:
+            router_catalog_status_code = int(exc.code)
+            router_catalog_access = "protected" if router_catalog_status_code in {401, 403} else "error"
+            failures.append(
+                "Hosted /api/rapid_router/catalog/status could not be validated"
+                f" (HTTP {router_catalog_status_code})."
+            )
+        except (URLError, TimeoutError, json.JSONDecodeError) as exc:
+            router_catalog_access = "error"
+            failures.append(
+                "Hosted /api/rapid_router/catalog/status could not be validated"
+                f" ({type(exc).__name__}: {exc})."
+            )
     build_version = str(build_info.get("build_version") or "").strip()
     git_sha = str(build_info.get("git_sha") or "").strip()
     startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
         failures.append(
             f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
         )
+    if args.require_router_workbook_loaded and router_catalog_loaded is False:
+        failures.append("Hosted router workbook catalog is not loaded.")
     if audience in FORBIDDEN_AUDIENCE_VALUES:
         failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
     if expected_origin:
             "health_ok": health_ok,
             "health_access": health_access,
             "health_status_code": health_status_code,
+            "router_catalog_loaded": router_catalog_loaded,
+            "router_catalog_product_count": router_catalog_product_count,
+            "router_catalog_access": router_catalog_access,
+            "router_catalog_status_code": router_catalog_status_code,
+            "router_catalog_status": router_catalog_status,
             "auth_required": auth_required,
             "auth_enabled": auth_enabled,
             "auth_audience": audience,
     parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
     parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
     parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
+    parser.add_argument(
+        "--require-router-workbook-loaded",
+        default="false",
+        help="Whether /api/rapid_router/catalog/status must report catalog.loaded=true.",
+    )
     parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
     parser.add_argument("--out", default="", help="Optional output JSON path.")
     args = parser.parse_args()
     args.expect_auth_required = _parse_bool(args.expect_auth_required)
     args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
+    args.require_router_workbook_loaded = _parse_bool(args.require_router_workbook_loaded)
     try:
         report = _build_report(args)

frontend/scripts/run-hosted-smoke.sh CHANGED Viewed

@@ -4,6 +4,8 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
 PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
 CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
@@ -12,6 +14,7 @@ PROD_POTS_WORKSPACE_EXPECTATION="${E2E_PROD_POTS_WORKSPACE_EXPECTATION:-project-
 CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
 CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
 TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
 if [[ ! -f "${ENV_FILE}" ]]; then
   echo "Missing env file: ${ENV_FILE}" >&2
@@ -27,6 +30,12 @@ run_target() {
   local pots_workspace_expectation="$3"
   echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
   E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
     e2e/auth.full-flow.spec.ts \
     e2e/pots.provider-coverage.spec.ts \

 # Absolute directory containing this script, and its parent (the frontend directory).
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 # Parent of the frontend directory; only used to build the path to the
 # hosted-runtime validator script under backend/scripts.
+REPO_ROOT="$(cd "${FRONTEND_DIR}/.." && pwd)"
+VALIDATOR="${REPO_ROOT}/backend/scripts/validate_hosted_runtime.py"
 # Each setting below uses the named E2E_* environment variable when it is set
 # and non-empty, and otherwise falls back to the literal default after ":-".
 ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
 PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
 CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
 CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
 CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
 # Space-separated list of smoke targets; defaults to all three.
 TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
 # Value passed to the validator's --timeout-s option (default 30).
+VALIDATE_TIMEOUT_S="${E2E_HOSTED_VALIDATE_TIMEOUT_S:-30}"
 if [[ ! -f "${ENV_FILE}" ]]; then
   echo "Missing env file: ${ENV_FILE}" >&2
   local pots_workspace_expectation="$3"
   echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
+  python3 "${VALIDATOR}" \
+    --base-url "${base_url}" \
+    --expect-auth-required true \
+    --expect-auth-enabled true \
+    --require-router-workbook-loaded true \
+    --timeout-s "${VALIDATE_TIMEOUT_S}"
   E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
     e2e/auth.full-flow.spec.ts \
     e2e/pots.provider-coverage.spec.ts \