Pete Dunn committed on
Commit
ddae266
·
1 Parent(s): 640f404

Harden router canary eval preflight and survey runs

Browse files
.github/workflows/deploy-hf-gated.yml CHANGED
@@ -379,6 +379,7 @@ jobs:
379
  --expected-git-sha "${EXPECTED_SHA}" \
380
  --expect-auth-required true \
381
  --expect-auth-enabled true \
 
382
  --out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
383
 
384
  deploy-production:
@@ -514,4 +515,5 @@ jobs:
514
  --expected-git-sha "${EXPECTED_SHA}" \
515
  --expect-auth-required true \
516
  --expect-auth-enabled true \
 
517
  --out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
 
379
  --expected-git-sha "${EXPECTED_SHA}" \
380
  --expect-auth-required true \
381
  --expect-auth-enabled true \
382
+ --require-router-workbook-loaded true \
383
  --out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
384
 
385
  deploy-production:
 
515
  --expected-git-sha "${EXPECTED_SHA}" \
516
  --expect-auth-required true \
517
  --expect-auth-enabled true \
518
+ --require-router-workbook-loaded true \
519
  --out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
backend/app/knowledgebase/core.py CHANGED
@@ -1695,6 +1695,52 @@ def _looks_like_router_lifecycle(message: str) -> bool:
1695
  return False
1696
 
1697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1698
  def _looks_like_pots(message: str) -> bool:
1699
  low = _normalize_router_query_text(message)
1700
  if _contains_any(low, _POTS_HINTS):
@@ -8811,8 +8857,20 @@ class UnifiedKnowledgebaseCore:
8811
  continue
8812
  seen_requested_compacts.add(compact)
8813
  requested_compare_labels.append(label)
 
 
 
 
 
 
 
 
8814
  asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
8815
  if asks_install_caveats:
 
 
 
 
8816
  compare_field_labels = {
8817
  "wan_lan": "WAN/LAN ports",
8818
  "antennas_rf": "RF connectors",
@@ -8842,13 +8900,108 @@ class UnifiedKnowledgebaseCore:
8842
  elif "install_caveats" not in requested_compare_fields:
8843
  requested_compare_fields.append("install_caveats")
8844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8845
  def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
8846
  raw = _norm(row.get(field, ""))
8847
  if (not raw) and field == "install_caveats":
8848
  raw = _norm(row.get("special notes", ""))
8849
- if not raw:
8850
- return "Not clearly documented"
8851
- return _truncate(_fix_common_mojibake(raw), 170)
8852
 
8853
  def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
8854
  doc_rows: List[Dict[str, str]] = []
@@ -9140,14 +9293,6 @@ class UnifiedKnowledgebaseCore:
9140
  if catalog_table is not None:
9141
  return catalog_table
9142
  return None
9143
- wants_doc_matrix = (
9144
- ("documented" in low and "not documented" in low)
9145
- or ("include what is documented" in low)
9146
- or ("what is documented vs not documented" in low)
9147
- )
9148
- wants_docs_only = wants_doc_matrix or any(
9149
- h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
9150
- )
9151
  deterministic_doc_matrix_supported = bool(
9152
  wants_doc_matrix
9153
  and len(dedup_models) >= 2
@@ -9185,6 +9330,31 @@ class UnifiedKnowledgebaseCore:
9185
  if not value:
9186
  return "Not clearly documented"
9187
  low_value = value.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9188
  if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
9189
  return "Not clearly documented"
9190
  if field_name == "wan_lan":
@@ -9206,6 +9376,8 @@ class UnifiedKnowledgebaseCore:
9206
  return "Not clearly documented"
9207
  return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
9208
  if field_name == "antennas_rf":
 
 
9209
  if "rf:" in value.lower():
9210
  value = value.split("RF:", 1)[-1]
9211
  connector_match = re.search(
@@ -9237,6 +9409,8 @@ class UnifiedKnowledgebaseCore:
9237
  return "Not clearly documented"
9238
  return _truncate(value, 120)
9239
  if field_name == "modem":
 
 
9240
  modem_match = re.search(
9241
  r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
9242
  value,
@@ -12488,6 +12662,51 @@ class UnifiedKnowledgebaseCore:
12488
  return "Listed, but family not explicit"
12489
  return "Needs connector validation"
12490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12491
  def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
12492
  use_case = _norm(row.get("primary_use_case", ""))
12493
  rugged = _norm(row.get("ruggedization", ""))
@@ -12663,7 +12882,7 @@ class UnifiedKnowledgebaseCore:
12663
  sources: List[Dict[str, Any]] = []
12664
  for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
12665
  wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
12666
- rf = _norm(row.get("antennas_rf", "")) or "Not listed"
12667
  lines.append(
12668
  f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
12669
  f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
@@ -12810,7 +13029,7 @@ class UnifiedKnowledgebaseCore:
12810
  modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
12811
  rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
12812
  battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
12813
- rf = _norm(row.get("antennas_rf", "")) or "Not listed (abstained)"
12814
  lines.append(
12815
  f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
12816
  f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
@@ -29329,9 +29548,19 @@ class UnifiedKnowledgebaseCore:
29329
  ]
29330
  ranking_rows: List[Tuple[int, int, str]] = []
29331
 
 
 
 
 
 
 
 
 
 
 
29332
  def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
29333
  qty = int(item.get("qty") or 0)
29334
- router_name = str(item.get("model_display") or item.get("product_text") or "Unknown")
29335
  if not matched:
29336
  return 150 + min(qty, 25), (
29337
  f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
@@ -29373,7 +29602,7 @@ class UnifiedKnowledgebaseCore:
29373
  reasons.append("authoritative lifecycle dates are incomplete")
29374
 
29375
  reason_text = ", and ".join(reasons[:3])
29376
- return score, f"`{_display_name(match) or router_name}` should be prioritized because {reason_text}."
29377
 
29378
  asks_ranked_output = any(
29379
  token in query.normalized_message
@@ -29393,14 +29622,14 @@ class UnifiedKnowledgebaseCore:
29393
  status = _status_label(match, lifecycle)
29394
  eos = lifecycle.get("end_of_sale_date") or "Not listed"
29395
  eol = lifecycle.get("last_support_date") or "Not listed"
29396
- router_name = _display_name(match) or str(item.get("model_display") or item.get("product_text") or "Unknown")
29397
  else:
29398
  same_brand = "Needs exact workbook match"
29399
  backup = "Needs exact workbook match"
29400
  status = "Needs exact workbook match"
29401
  eos = "Not listed"
29402
  eol = "Not listed"
29403
- router_name = str(item.get("model_display") or item.get("product_text") or "Unknown")
29404
  rows.append(
29405
  "| "
29406
  + " | ".join(
@@ -31813,7 +32042,7 @@ class UnifiedKnowledgebaseCore:
31813
  blocked["meta"] = blocked_meta
31814
  return blocked
31815
  lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
31816
- if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text):
31817
  blocked = self._policy_block_response("exact_lifecycle", st)
31818
  blocked_meta = _as_dict(blocked.get("meta"))
31819
  blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}
 
1695
  return False
1696
 
1697
 
1698
+ def _is_supported_router_mixed_lifecycle_request(message: str) -> bool:
1699
+ normalized = _normalize_router_query_text(message)
1700
+ query = parse_router_intelligence_query(message)
1701
+ if query is None or query.intent not in {"details", "compare"}:
1702
+ return False
1703
+ exact_date_tokens = (
1704
+ "exact lifecycle date",
1705
+ "lifecycle date",
1706
+ "end of sale date",
1707
+ "end of life date",
1708
+ "eos date",
1709
+ "eol date",
1710
+ "what exact date",
1711
+ )
1712
+ if any(token in normalized for token in exact_date_tokens):
1713
+ return False
1714
+ safe_field_tokens = (
1715
+ "primary use case",
1716
+ "use case",
1717
+ "wan/lan",
1718
+ "wan",
1719
+ "lan",
1720
+ "ethernet",
1721
+ "ports",
1722
+ "wi-fi",
1723
+ "wifi",
1724
+ "modem",
1725
+ "rf connector",
1726
+ "rf connectors",
1727
+ "connectors",
1728
+ "install caveat",
1729
+ "install caveats",
1730
+ "current recommendation",
1731
+ "still current recommendation",
1732
+ "status",
1733
+ "documented vs",
1734
+ "not documented",
1735
+ "compare",
1736
+ "difference",
1737
+ "different",
1738
+ "versus",
1739
+ " vs ",
1740
+ )
1741
+ return any(token in normalized for token in safe_field_tokens)
1742
+
1743
+
1744
  def _looks_like_pots(message: str) -> bool:
1745
  low = _normalize_router_query_text(message)
1746
  if _contains_any(low, _POTS_HINTS):
 
8857
  continue
8858
  seen_requested_compacts.add(compact)
8859
  requested_compare_labels.append(label)
8860
+ wants_doc_matrix = (
8861
+ ("documented" in low and "not documented" in low)
8862
+ or ("include what is documented" in low)
8863
+ or ("what is documented vs not documented" in low)
8864
+ )
8865
+ wants_docs_only = wants_doc_matrix or any(
8866
+ h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
8867
+ )
8868
  asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
8869
  if asks_install_caveats:
8870
+ # This helper only runs in the router_docs lane, so install-caveat
8871
+ # compare prompts should still stay source-bounded even when the
8872
+ # user omits an explicit "docs only" phrase.
8873
+ wants_docs_only = True
8874
  compare_field_labels = {
8875
  "wan_lan": "WAN/LAN ports",
8876
  "antennas_rf": "RF connectors",
 
8900
  elif "install_caveats" not in requested_compare_fields:
8901
  requested_compare_fields.append("install_caveats")
8902
 
8903
+ variant_ambiguity_markers = (
8904
+ "vary by sku",
8905
+ "varies by sku",
8906
+ "depends on package",
8907
+ "depends on package or accessory",
8908
+ "exact modem depends",
8909
+ "package or accessory",
8910
+ "supported modem option",
8911
+ "supported modem options",
8912
+ "supported modem sku",
8913
+ "supported modem skus",
8914
+ "modem-equipped variant",
8915
+ "modem-equipped variants",
8916
+ "no-modem",
8917
+ )
8918
+
8919
+ def _has_variant_ambiguity(value: str) -> bool:
8920
+ low_value = value.lower()
8921
+ return any(marker in low_value for marker in variant_ambiguity_markers)
8922
+
8923
+ def _sanitize_catalog_install_compare_value(field: str, raw_value: Any) -> str:
8924
+ value = _fix_common_mojibake(_norm(raw_value))
8925
+ if not value:
8926
+ return "Not clearly documented"
8927
+ low_value = value.lower()
8928
+ if field == "wan_lan":
8929
+ port_match = re.search(
8930
+ r"\bfive ethernet ports?\b|\bdual ethernet ports?\b|\b\d+\s*(?:x|\*)\s*(?:10/100/1000|10/100|100mbit/s|100mbps|1gbe|2\.5gbe|ge)\s*(?:rj45\s*)?(?:ethernet|network)\s*ports?\b(?:[^.;]{0,40}\b(?:wan|lan|vlan)\b[^.;]{0,40})?",
8931
+ value,
8932
+ flags=re.IGNORECASE,
8933
+ )
8934
+ if port_match:
8935
+ value = port_match.group(0)
8936
+ low_value = value.lower()
8937
+ has_port_signal = bool(
8938
+ re.search(r"\b(?:wan|lan|ethernet|rj45|sfp\+?)\b", low_value)
8939
+ or re.search(r"\b\d+\s*x\b", low_value)
8940
+ or re.search(r"\b\d+(?:\.\d+)?\s*(?:gbe?|mbps|gbps)\b", low_value)
8941
+ or re.search(r"\b(single|dual|triple|quad|five)\b", low_value)
8942
+ )
8943
+ if not has_port_signal:
8944
+ return "Not clearly documented"
8945
+ return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
8946
+ if field == "antennas_rf":
8947
+ if _has_variant_ambiguity(value):
8948
+ return "Needs exact SKU/package; connector path varies across documented family variants."
8949
+ connector_match = re.search(
8950
+ r"(?:\d+\s*x\s*)?(?:rp-?sma|sma)\b[^.;]{0,80}|antenna connectors?[^.;]{0,100}|(?:gps|gnss)\b[^.;]{0,80}",
8951
+ value,
8952
+ flags=re.IGNORECASE,
8953
+ )
8954
+ if not connector_match:
8955
+ return "Not clearly documented"
8956
+ value = connector_match.group(0)
8957
+ value = re.sub(r"^(?:both|external|internal)\s*\([^)]*\)\s*;?\s*", "", value, flags=re.IGNORECASE)
8958
+ value = re.sub(r"\([^)]*(?:depends on|if present|verify [^)]+|see [^)]+ docs)[^)]*\)", "", value, flags=re.IGNORECASE)
8959
+ value = re.sub(r"[.;]\s*Adapter pigtails?:[^.;]*", "", value, flags=re.IGNORECASE)
8960
+ value = re.sub(r"[.;]\s*connectors likely[^.;]*", "", value, flags=re.IGNORECASE)
8961
+ value = re.sub(r"[.;]\s*verify connector type[^.;]*", "", value, flags=re.IGNORECASE)
8962
+ value = re.sub(r"[.;]\s*(SIM|Ethernet):[^.;]*", "", value, flags=re.IGNORECASE)
8963
+ value = re.sub(r"\s+", " ", value).strip(" ;.")
8964
+ low_value = value.lower()
8965
+ if (not value) or (not any(token in low_value for token in ("sma", "rp-sma", "connector", "gnss", "gps"))):
8966
+ return "Not clearly documented"
8967
+ return _truncate(value, 140)
8968
+ if field == "modem":
8969
+ if _has_variant_ambiguity(value):
8970
+ return "Needs exact SKU/package; modem path varies across documented family variants."
8971
+ modem_match = re.search(
8972
+ r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
8973
+ value,
8974
+ flags=re.IGNORECASE,
8975
+ )
8976
+ if not modem_match:
8977
+ return "Not clearly documented"
8978
+ value = modem_match.group(0)
8979
+ low_value = value.lower()
8980
+ if low_value in {"modem", "modems", "cellular", "cellular modem"}:
8981
+ return "Not clearly documented"
8982
+ if any(token in low_value for token in ("ethernet", "wan", "lan", "cloud management", "ports", "wi-fi", "vpn", "connectivity", "secure")):
8983
+ return "Not clearly documented"
8984
+ return _truncate(value, 120)
8985
+ if field == "wifi":
8986
+ if any(token in low_value for token in ("none", "no wi-fi", "no wifi", "without wi-fi", "without wifi")):
8987
+ return "None"
8988
+ if ("802.11" not in low_value) and (not re.search(r"\bwi-?fi\s*[4567]\b", low_value)):
8989
+ return "Not clearly documented"
8990
+ return _truncate(value, 120)
8991
+ if field == "battery":
8992
+ if low_value == "none":
8993
+ return "None"
8994
+ return _truncate(value, 120)
8995
+ if field == "install_caveats":
8996
+ if _has_variant_ambiguity(value) or any(token in low_value for token in ("verify exact", "exact sku", "exact modem", "exact package")):
8997
+ return "Verify exact SKU/package before finalizing modem, RF, or accessory assumptions."
8998
+ return _truncate(value, 170)
8999
+
9000
  def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
9001
  raw = _norm(row.get(field, ""))
9002
  if (not raw) and field == "install_caveats":
9003
  raw = _norm(row.get("special notes", ""))
9004
+ return _sanitize_catalog_install_compare_value(field, raw)
 
 
9005
 
9006
  def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
9007
  doc_rows: List[Dict[str, str]] = []
 
9293
  if catalog_table is not None:
9294
  return catalog_table
9295
  return None
 
 
 
 
 
 
 
 
9296
  deterministic_doc_matrix_supported = bool(
9297
  wants_doc_matrix
9298
  and len(dedup_models) >= 2
 
9330
  if not value:
9331
  return "Not clearly documented"
9332
  low_value = value.lower()
9333
+ variant_ambiguity_markers = (
9334
+ "variant",
9335
+ "variants",
9336
+ "exact sku",
9337
+ "exact package",
9338
+ "exact modem",
9339
+ "modem option",
9340
+ "modem options",
9341
+ "sku/package",
9342
+ "sku package",
9343
+ "depends on sku",
9344
+ "depends on package",
9345
+ "see model-specific docs",
9346
+ "see exact sku docs",
9347
+ "supported modem sku",
9348
+ "supported modem skus",
9349
+ "modem-equipped variant",
9350
+ "modem-equipped variants",
9351
+ "no-modem",
9352
+ )
9353
+
9354
+ def _has_variant_ambiguity_local(text: str) -> bool:
9355
+ local_low = text.lower()
9356
+ return any(marker in local_low for marker in variant_ambiguity_markers)
9357
+
9358
  if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
9359
  return "Not clearly documented"
9360
  if field_name == "wan_lan":
 
9376
  return "Not clearly documented"
9377
  return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
9378
  if field_name == "antennas_rf":
9379
+ if _has_variant_ambiguity_local(value):
9380
+ return "Needs exact SKU/package; connector path varies across documented family variants."
9381
  if "rf:" in value.lower():
9382
  value = value.split("RF:", 1)[-1]
9383
  connector_match = re.search(
 
9409
  return "Not clearly documented"
9410
  return _truncate(value, 120)
9411
  if field_name == "modem":
9412
+ if _has_variant_ambiguity_local(value):
9413
+ return "Needs exact SKU/package; modem path varies across documented family variants."
9414
  modem_match = re.search(
9415
  r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
9416
  value,
 
12662
  return "Listed, but family not explicit"
12663
  return "Needs connector validation"
12664
 
12665
+ def _vehicle_compare_connector_summary(row: Dict[str, Any]) -> str:
12666
+ value = _fix_common_mojibake(_norm(row.get("antennas_rf", "")))
12667
+ if not value:
12668
+ return "Not clearly documented"
12669
+ low_value = value.lower()
12670
+ clauses: List[str] = []
12671
+
12672
+ def _remember(text: str) -> None:
12673
+ cleaned = re.sub(r"\s+", " ", _norm(text)).strip(" ;,.")
12674
+ if not cleaned:
12675
+ return
12676
+ if cleaned.lower() in {item.lower() for item in clauses}:
12677
+ return
12678
+ clauses.append(cleaned)
12679
+
12680
+ for match in re.finditer(
12681
+ r"\b\d+\s*x\s*(?:sma|rp-?sma)\s*(?:cellular|wi-?fi|gnss|gps)?(?:\s+connectors?)?\b",
12682
+ value,
12683
+ flags=re.IGNORECASE,
12684
+ ):
12685
+ _remember(match.group(0))
12686
+ for match in re.finditer(
12687
+ r"\b(?:external\s+)?(?:cellular\s+sma connectors?|reverse-?sma wi-?fi connectors?|sma rf connectors?|wi-?fi variant uses rp-?sma)\b",
12688
+ value,
12689
+ flags=re.IGNORECASE,
12690
+ ):
12691
+ _remember(match.group(0))
12692
+ for group in re.findall(r"\(([^)]*(?:sma|rp-?sma|gps|gnss)[^)]*)\)", value, flags=re.IGNORECASE):
12693
+ for part in re.split(r"[;,]", group):
12694
+ if any(token in part.lower() for token in ("typical", "adapter", "pigtail")):
12695
+ continue
12696
+ if re.search(r"\b(?:sma|rp-?sma|gps|gnss)\b", part, flags=re.IGNORECASE):
12697
+ _remember(part)
12698
+ if not clauses and any(token in low_value for token in ("sma", "rp-sma", "gps", "gnss", "connector")):
12699
+ truncated = re.split(r"[.;]\s*Adapter pigtails?:", value, maxsplit=1, flags=re.IGNORECASE)[0]
12700
+ truncated = re.sub(r"\bCellular:\s*4x4 MIMO on SMA\b", "", truncated, flags=re.IGNORECASE)
12701
+ truncated = re.sub(r"\bWi-?Fi(?:\s*\(if present\))?\s+on\s+RP-SMA\b", "", truncated, flags=re.IGNORECASE)
12702
+ truncated = re.sub(r"\bGNSS on SMA\b", "", truncated, flags=re.IGNORECASE)
12703
+ truncated = re.sub(r"\s+", " ", truncated).strip(" ;,.")
12704
+ if truncated:
12705
+ _remember(truncated)
12706
+ if not clauses:
12707
+ return "Not clearly documented"
12708
+ return _truncate("; ".join(clauses), 140)
12709
+
12710
  def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
12711
  use_case = _norm(row.get("primary_use_case", ""))
12712
  rugged = _norm(row.get("ruggedization", ""))
 
12882
  sources: List[Dict[str, Any]] = []
12883
  for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
12884
  wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
12885
+ rf = _vehicle_compare_connector_summary(row)
12886
  lines.append(
12887
  f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
12888
  f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
 
13029
  modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
13030
  rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
13031
  battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
13032
+ rf = _vehicle_compare_connector_summary(row)
13033
  lines.append(
13034
  f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
13035
  f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
 
29548
  ]
29549
  ranking_rows: List[Tuple[int, int, str]] = []
29550
 
29551
+ def _fleet_router_name(item: Dict[str, Any], matched: Dict[str, Any]) -> str:
29552
+ requested_label = _norm(item.get("model_display") or item.get("product_text") or "")
29553
+ match = _as_dict(matched.get("match"))
29554
+ product_key = _norm(match.get("product_key") or "")
29555
+ if requested_label and product_key:
29556
+ if _compact_model(requested_label) == _compact_model(product_key):
29557
+ return requested_label
29558
+ return product_key
29559
+ return product_key or requested_label or _display_name(match) or "Unknown"
29560
+
29561
  def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
29562
  qty = int(item.get("qty") or 0)
29563
+ router_name = _fleet_router_name(item, matched)
29564
  if not matched:
29565
  return 150 + min(qty, 25), (
29566
  f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
 
29602
  reasons.append("authoritative lifecycle dates are incomplete")
29603
 
29604
  reason_text = ", and ".join(reasons[:3])
29605
+ return score, f"`{router_name}` should be prioritized because {reason_text}."
29606
 
29607
  asks_ranked_output = any(
29608
  token in query.normalized_message
 
29622
  status = _status_label(match, lifecycle)
29623
  eos = lifecycle.get("end_of_sale_date") or "Not listed"
29624
  eol = lifecycle.get("last_support_date") or "Not listed"
29625
+ router_name = _fleet_router_name(item, matched)
29626
  else:
29627
  same_brand = "Needs exact workbook match"
29628
  backup = "Needs exact workbook match"
29629
  status = "Needs exact workbook match"
29630
  eos = "Not listed"
29631
  eol = "Not listed"
29632
+ router_name = _fleet_router_name(item, matched)
29633
  rows.append(
29634
  "| "
29635
  + " | ".join(
 
32042
  blocked["meta"] = blocked_meta
32043
  return blocked
32044
  lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
32045
+ if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text) and (not _is_supported_router_mixed_lifecycle_request(msg)):
32046
  blocked = self._policy_block_response("exact_lifecycle", st)
32047
  blocked_meta = _as_dict(blocked.get("meta"))
32048
  blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}
backend/app/main.py CHANGED
@@ -1706,16 +1706,19 @@ _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS: Tuple[str, ...] = (
1706
 
1707
 
1708
  def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
1709
- search_roots: List[Path] = [
1710
  Path.cwd(),
1711
  Path.cwd() / "backend",
 
1712
  Path.home() / "Downloads",
1713
  Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
1714
  Path("/data"),
 
1715
  Path("/tmp"),
 
1716
  ]
1717
  seen: set[str] = set()
1718
- for root in search_roots:
1719
  root_key = str(root)
1720
  if root_key in seen or not root.exists() or not root.is_dir():
1721
  continue
@@ -1724,6 +1727,26 @@ def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
1724
  for candidate in sorted(root.glob(pattern)):
1725
  if candidate.is_file():
1726
  return candidate
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1727
  return None
1728
 
1729
 
 
1706
 
1707
 
1708
  def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
1709
+ direct_search_roots: List[Path] = [
1710
  Path.cwd(),
1711
  Path.cwd() / "backend",
1712
+ Path.cwd() / "backend" / "data",
1713
  Path.home() / "Downloads",
1714
  Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
1715
  Path("/data"),
1716
+ Path("/data/rapid_router"),
1717
  Path("/tmp"),
1718
+ _resolve_rapid_router_storage_dir(),
1719
  ]
1720
  seen: set[str] = set()
1721
+ for root in direct_search_roots:
1722
  root_key = str(root)
1723
  if root_key in seen or not root.exists() or not root.is_dir():
1724
  continue
 
1727
  for candidate in sorted(root.glob(pattern)):
1728
  if candidate.is_file():
1729
  return candidate
1730
+ recursive_search_roots: List[Path] = [
1731
+ Path.cwd() / "backend" / "data",
1732
+ Path("/data"),
1733
+ Path("/data/rapid_router"),
1734
+ _resolve_rapid_router_storage_dir(),
1735
+ Path("/tmp"),
1736
+ ]
1737
+ for root in recursive_search_roots:
1738
+ root_key = f"recursive:{root}"
1739
+ if root_key in seen or not root.exists() or not root.is_dir():
1740
+ continue
1741
+ seen.add(root_key)
1742
+ for pattern in _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS:
1743
+ for candidate in sorted(root.rglob(pattern)):
1744
+ try:
1745
+ depth = len(candidate.relative_to(root).parts)
1746
+ except Exception:
1747
+ depth = 999
1748
+ if candidate.is_file() and depth <= 4:
1749
+ return candidate
1750
  return None
1751
 
1752
 
backend/app/test_rapid_router_catalog_bootstrap.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import app.main as main
6
+
7
+
8
+ def test_resolve_rapid_router_catalog_workbook_path_finds_nested_backend_data_workbook(
9
+ tmp_path: Path, monkeypatch
10
+ ) -> None:
11
+ nested = tmp_path / "backend" / "data" / "rapid_router" / "imports"
12
+ nested.mkdir(parents=True)
13
+ workbook = nested / "device_master_source_of_truth_v26_site_survey_integrated_export.xlsx"
14
+ workbook.write_bytes(b"placeholder")
15
+ fake_home = tmp_path / "fake_home"
16
+ (fake_home / "Downloads").mkdir(parents=True)
17
+
18
+ monkeypatch.chdir(tmp_path)
19
+ monkeypatch.setattr(main, "_resolve_rapid_router_storage_dir", lambda: tmp_path / "storage")
20
+ monkeypatch.setattr(main.Path, "home", classmethod(lambda cls: fake_home))
21
+
22
+ resolved = main._resolve_rapid_router_catalog_workbook_path()
23
+
24
+ assert resolved == workbook
backend/app/test_score_router_canary_ab_responses.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import importlib.util
4
+ from pathlib import Path
5
+
6
+
7
+ def _load_score_module():
8
+ script_path = Path(__file__).resolve().parents[1] / "scripts" / "score_router_canary_ab_responses.py"
9
+ spec = importlib.util.spec_from_file_location("score_router_canary_ab_responses", script_path)
10
+ assert spec is not None and spec.loader is not None
11
+ module = importlib.util.module_from_spec(spec)
12
+ spec.loader.exec_module(module)
13
+ return module
14
+
15
+
16
+ def test_normalize_grade_payload_scales_fractional_scores_to_percent() -> None:
17
+ module = _load_score_module()
18
+
19
+ out = module._normalize_grade_payload(
20
+ {
21
+ "fact_score": 0.64,
22
+ "instruction_score": 0.81,
23
+ "coverage_score": 0.52,
24
+ "readability_score": 0.9,
25
+ "safety_score": 1.0,
26
+ "overall_score": 0.74,
27
+ "issues": ["ambiguous_specs"],
28
+ "rationale": "fractional payload",
29
+ }
30
+ )
31
+
32
+ assert out["fact_score"] == 64
33
+ assert out["instruction_score"] == 81
34
+ assert out["coverage_score"] == 52
35
+ assert out["readability_score"] == 90
36
+ assert out["safety_score"] == 100
37
+ assert out["overall_score"] == 74
38
+ assert out["grade"] == "C"
39
+
40
+
41
+ def test_normalize_grade_payload_scales_ten_point_scores_to_percent() -> None:
42
+ module = _load_score_module()
43
+
44
+ out = module._normalize_grade_payload(
45
+ {
46
+ "fact_score": 7,
47
+ "instruction_score": 8.5,
48
+ "coverage_score": 6,
49
+ "readability_score": 9,
50
+ "safety_score": 10,
51
+ }
52
+ )
53
+
54
+ assert out["fact_score"] == 70
55
+ assert out["instruction_score"] == 85
56
+ assert out["coverage_score"] == 60
57
+ assert out["readability_score"] == 90
58
+ assert out["safety_score"] == 100
59
+ assert out["overall_score"] == 81
60
+ assert out["grade"] == "B"
61
+
62
+
63
+ def test_normalize_grade_payload_preserves_percent_scale_and_percent_strings() -> None:
64
+ module = _load_score_module()
65
+
66
+ out = module._normalize_grade_payload(
67
+ {
68
+ "fact_score": "64%",
69
+ "instruction_score": "81",
70
+ "coverage_score": 52,
71
+ "readability_score": 90,
72
+ "safety_score": "100%",
73
+ "overall_score": 74,
74
+ }
75
+ )
76
+
77
+ assert out["fact_score"] == 64
78
+ assert out["instruction_score"] == 81
79
+ assert out["coverage_score"] == 52
80
+ assert out["readability_score"] == 90
81
+ assert out["safety_score"] == 100
82
+ assert out["overall_score"] == 74
backend/app/test_unified_kb_core.py CHANGED
@@ -4113,6 +4113,24 @@ def test_unified_kb_router_install_caveat_compare_defers_to_router_docs_delegate
4113
  assert "Router docs answer." in str(out.get("assistant") or "")
4114
 
4115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4116
  def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
4117
  core = build_core()
4118
  out = core.handle_message(
@@ -4239,6 +4257,21 @@ def test_unified_kb_docs_only_three_model_compare_defers_to_router_docs() -> Non
4239
  assert "Lifecycle note:" not in assistant
4240
 
4241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4242
  def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
4243
  core = build_core_with(router_core=RepoCsvRouterCore())
4244
  out = core.handle_message(
@@ -4405,6 +4438,54 @@ def test_unified_kb_router_decision_table_compare_defers_to_router_docs() -> Non
4405
  assert "| MAXBR1PRO |" not in assistant
4406
 
4407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4408
  def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
4409
  dec_path = tmp_path / "feb2026routers.csv"
4410
  dec_path.write_text(
@@ -4454,6 +4535,20 @@ def test_unified_kb_br1_connector_compare_uses_deterministic_compare_lane() -> N
4454
  assert "Lifecycle note:" not in assistant
4455
 
4456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4457
  def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
4458
  core = build_core()
4459
  out = core.handle_message(
 
4113
  assert "Router docs answer." in str(out.get("assistant") or "")
4114
 
4115
 
4116
def test_unified_kb_docs_only_install_compare_keeps_variant_family_rows_conservative() -> None:
    """Check the docs-only AER2200/AER1600 install-caveat table stays conservative.

    The answer must come from the multi-model caveat-table fast path, must
    carry the "Needs exact SKU/package" placeholders for connector and modem
    rows, and must not contain the variant-specific phrases asserted absent
    below.
    """
    core = build_core_with(router_core=RepoCsvRouterCore())
    out = core.handle_message(
        "For AER2200 and AER1600, summarize WAN/LAN, RF connectors, modem variants, and install caveats in one table from docs only.",
        {},
        mode="auto",
        audience="auto",
        show_citations=True,
    )
    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"
    assistant = str(out.get("assistant") or "")
    assert "Needs exact SKU/package; connector path varies across documented family variants." in assistant
    assert "Needs exact SKU/package; modem path varies across documented family variants." in assistant
    assert "Cat 4 / Cat 6 / LTE Advanced Pro bundles" not in assistant
    assert "external reverse-SMA Wi-Fi connectors" not in assistant
4132
+
4133
+
4134
  def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
4135
  core = build_core()
4136
  out = core.handle_message(
 
4257
  assert "Lifecycle note:" not in assistant
4258
 
4259
 
4260
def test_unified_kb_docs_only_mg_eval_prompt_stays_documented_and_conservative() -> None:
    """Check the MG51/MG52/MG52E compare prompt yields the documented-matrix answer.

    Calls the fast-path helper directly (not `handle_message`) and expects a
    "documented vs not-documented" matrix without adapter-pigtail or
    lifecycle-note text.
    """
    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    out = core._router_multi_model_doc_table_fast(
        "Compare MG51 vs MG52 vs MG52E and show only meaningful differences, including what is documented vs not documented."
    )
    assert out is not None
    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"
    assistant = str(out.get("assistant") or "")
    assert "Documented vs not-documented comparison" in assistant
    assert "Not documented" in assistant
    assert "Adapter pigtails" not in assistant
    assert "Lifecycle note:" not in assistant
4273
+
4274
+
4275
  def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
4276
  core = build_core_with(router_core=RepoCsvRouterCore())
4277
  out = core.handle_message(
 
4438
  assert "| MAXBR1PRO |" not in assistant
4439
 
4440
 
4441
def test_unified_kb_router_decision_table_keeps_rf_guidance_conservative() -> None:
    """Check the police-vehicle comparison table uses conservative RF wording.

    The response must come from the vehicle 5G recommendation fast path,
    list the connector counts asserted present below, and omit the
    "4x4 MIMO on SMA" and "Adapter pigtails" phrases.
    """
    core = build_core_with(router_core=RepoCsvRouterCore())
    out = core.handle_message(
        "Build a comparison table for XR60, R980, and MAX BR1 Pro 5G for police vehicles, including RF connectors, vehicle fit, and conservative antenna-family guidance.",
        {},
        mode="auto",
        audience="auto",
        show_citations=True,
    )
    assert out["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"
    assistant = str(out.get("assistant") or "")
    assert "4x4 MIMO on SMA" not in assistant
    assert "Adapter pigtails" not in assistant
    assert "4x SMA cellular" in assistant
    assert "2x RP-SMA Wi-Fi" in assistant
4456
+
4457
+
4458
def test_unified_kb_aer_docs_only_eval_prompt_stays_variant_conservative() -> None:
    """Check the AER1600 vs AER2200 docs-only compare keeps variant rows generic.

    Calls the fast-path helper directly and expects the documented-matrix
    retrieval mode, with the "Needs exact SKU/package" placeholders for
    connector and modem rows and no reverse-SMA Wi-Fi connector claim.
    """
    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    out = core._router_multi_model_doc_table_fast(
        "Compare AER1600 vs AER2200 from docs only and separate clearly documented from not documented."
    )
    assert out is not None
    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"
    assistant = str(out.get("assistant") or "")
    assert "Needs exact SKU/package; connector path varies across documented family variants." in assistant
    assert "Needs exact SKU/package; modem path varies across documented family variants." in assistant
    assert "external reverse-SMA Wi-Fi connectors" not in assistant
4470
+
4471
+
4472
def test_unified_kb_vehicle_eval_prompt_avoids_speculative_rf_language() -> None:
    """Check the police-vehicle decision table avoids speculative RF phrasing.

    Besides the two phrases that must be absent, the answer has to contain
    at least one of the three accepted antenna-guidance markers listed in
    the final assertion.
    """
    core = build_core_with(router_core=RepoCsvRouterCore())
    out = core.handle_message(
        "Build a decision table comparing XR60, R980, and MAX BR1 Pro 5G for police vehicles with recommended antenna families.",
        {},
        mode="auto",
        audience="auto",
        show_citations=True,
    )
    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"
    assistant = str(out.get("assistant") or "")
    assert "4x4 MIMO on SMA" not in assistant
    assert "Adapter pigtails" not in assistant
    assert ("Needs connector validation" in assistant) or ("Husky" in assistant) or ("Listed, but family not explicit" in assistant)
4487
+
4488
+
4489
  def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
4490
  dec_path = tmp_path / "feb2026routers.csv"
4491
  dec_path.write_text(
 
4535
  assert "Lifecycle note:" not in assistant
4536
 
4537
 
4538
def test_unified_kb_br1_docs_only_eval_prompt_avoids_sparse_alias_rows() -> None:
    """Check the BR1 Pro 5G vs BR1 Mini 5G docs-only table cites one source per model.

    Both columns of the "Internal documented source" row must name
    feb2026routers.csv; the normalized pricing catalog and adapter-pigtail
    text must not appear.
    """
    core = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    out = core._router_multi_model_doc_table_fast(
        "Compare MAX BR1 Pro 5G vs MAX BR1 Mini 5G from documented specs only, in table format."
    )
    assert out is not None
    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_table_fast"
    assistant = str(out.get("assistant") or "")
    assert "| Internal documented source | feb2026routers.csv | feb2026routers.csv |" in assistant
    assert "router_pricing_catalog_normalized.csv" not in assistant
    assert "Adapter pigtails" not in assistant
4550
+
4551
+
4552
  def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
4553
  core = build_core()
4554
  out = core.handle_message(
backend/app/test_unified_kb_router_workbook.py CHANGED
@@ -94,8 +94,12 @@ def test_unified_kb_router_docs_spec_table_defers_to_documented_sources(tmp_path
94
  )
95
 
96
  assert out["meta"]["domain"] == "router_docs"
 
97
  assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
98
  assert out["meta"].get("router_intelligence_source") != "workbook"
 
 
 
99
  assert router_core.calls == 0
100
 
101
 
@@ -163,6 +167,66 @@ def test_unified_kb_router_docs_details_handles_typo_with_workbook(tmp_path: Pat
163
  assert router_core.calls == 0
164
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
167
  eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
168
  router_rag = StubRouterRag()
@@ -525,6 +589,33 @@ def test_unified_kb_router_lifecycle_fleet_snapshot_prefers_workbook(tmp_path: P
525
  assert router_core.calls == 0
526
 
527
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
529
  workbook_core = _loaded_workbook_core(tmp_path)
530
  core = build_core_with(
 
94
  )
95
 
96
  assert out["meta"]["domain"] == "router_docs"
97
+ assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"
98
  assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
99
  assert out["meta"].get("router_intelligence_source") != "workbook"
100
+ assert "Documented multi-model compare table (internal docs only):" in out["assistant"]
101
+ assert "| Model | WAN/LAN ports | RF connectors | Modem/cellular | Install caveats | Evidence |" in out["assistant"]
102
+ assert "I need stronger internal citations" not in out["assistant"]
103
  assert router_core.calls == 0
104
 
105
 
 
167
  assert router_core.calls == 0
168
 
169
 
170
def test_unified_kb_router_docs_mixed_lifecycle_details_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
    """Check a mixed lifecycle + spec details prompt is answered from the workbook.

    With a loaded workbook core supplied through the provider, the reply
    must use the deterministic workbook details mode, report no `reason`
    in its metadata, and leave both the RAG stub and the router core
    uncalled.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    router_rag = StubRouterRag()
    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    workbook_core = _loaded_workbook_core(tmp_path)
    core = build_core_with(
        router_rag_core=router_rag,
        router_core=router_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    out = core.handle_message(
        "Give me the details on Current-500, including lifecycle status, primary use case, WAN/LAN, Wi-Fi, and whether it is still a current recommendation.",
        {},
        mode="auto",
        audience="auto",
        show_citations=True,
    )

    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_details"
    assert out["meta"]["router_intelligence_source"] == "workbook"
    assert out["meta"].get("reason") is None
    assert "Current-500" in str(out.get("assistant") or "")
    assert router_rag.calls == 0
    assert router_core.calls == 0
198
+
199
+
200
def test_unified_kb_router_docs_mixed_lifecycle_compare_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
    """Check a mixed lifecycle + spec compare prompt is answered from the workbook.

    With a loaded workbook core supplied through the provider, the reply
    must use the deterministic workbook compare mode, report no `reason`
    in its metadata, and leave both the RAG stub and the router core
    uncalled.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    router_rag = StubRouterRag()
    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    workbook_core = _loaded_workbook_core(tmp_path)
    core = build_core_with(
        router_rag_core=router_rag,
        router_core=router_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    out = core.handle_message(
        "Compare Legacy-100 and Current-500, including lifecycle status, WAN/LAN, Wi-Fi, and whether either is still a current recommendation.",
        {},
        mode="auto",
        audience="auto",
        show_citations=True,
    )

    assert out["meta"]["domain"] == "router_docs"
    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_compare"
    assert out["meta"]["router_intelligence_source"] == "workbook"
    assert out["meta"].get("reason") is None
    assert "Workbook-backed router comparison" in str(out.get("assistant") or "")
    assert router_rag.calls == 0
    assert router_core.calls == 0
228
+
229
+
230
  def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
231
  eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
232
  router_rag = StubRouterRag()
 
589
  assert router_core.calls == 0
590
 
591
 
592
def test_unified_kb_router_lifecycle_fleet_snapshot_keeps_requested_model_labels(tmp_path: Path) -> None:
    """Check the fleet lifecycle table contains the expected rows with quantities.

    The prompt lists 12 Legacy-100 and 3 Legacy-NR units; the answer must
    contain the two table rows asserted below, must not contain the
    "Test Corp legacy 4G router" text, and must not touch the router core.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    workbook_core = _loaded_workbook_core(tmp_path)
    router_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    core = build_core_with(
        router_rag_core=StubRouterRag(),
        router_core=router_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    out = core.handle_message(
        "Customer portfolio: 12 Legacy-100, 3 Legacy-NR. Build phased 5G replacement strategy with table.",
        {},
        mode="router_lifecycle",
        audience="auto",
        show_citations=True,
    )

    assert out["meta"]["retrieval_mode"] == "deterministic_router_workbook_fleet_lifecycle"
    assert "| Unknown | prod_test_legacy | 12 |" in out["assistant"]
    assert "| Unknown | prod_test_no_replacement | 3 |" in out["assistant"]
    assert "Test Corp legacy 4G router" not in out["assistant"]
    assert router_core.calls == 0
617
+
618
+
619
  def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
620
  workbook_core = _loaded_workbook_core(tmp_path)
621
  core = build_core_with(
backend/app/test_validate_hosted_runtime.py CHANGED
@@ -15,13 +15,14 @@ def _load_validate_hosted_runtime_module():
15
  return module
16
 
17
 
18
- def _args(*, expect_auth_required: bool = True):
19
  return argparse.Namespace(
20
  base_url="https://example.hf.space",
21
  expected_build_version="release-123",
22
  expected_git_sha="abc123",
23
  expect_auth_required=expect_auth_required,
24
  expect_auth_enabled=True,
 
25
  timeout_s=20.0,
26
  out="",
27
  )
@@ -67,6 +68,128 @@ def test_build_report_accepts_protected_health_when_auth_required(monkeypatch) -
67
  assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
68
 
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
71
  module = _load_validate_hosted_runtime_module()
72
 
 
15
  return module
16
 
17
 
18
+ def _args(*, expect_auth_required: bool = True, require_router_workbook_loaded: bool = False):
19
  return argparse.Namespace(
20
  base_url="https://example.hf.space",
21
  expected_build_version="release-123",
22
  expected_git_sha="abc123",
23
  expect_auth_required=expect_auth_required,
24
  expect_auth_enabled=True,
25
+ require_router_workbook_loaded=require_router_workbook_loaded,
26
  timeout_s=20.0,
27
  out="",
28
  )
 
68
  assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
69
 
70
 
71
def test_build_report_requires_router_workbook_loaded(monkeypatch) -> None:
    """Check the report passes when the catalog status endpoint says "loaded".

    `_fetch_json` is replaced with a fake that serves a healthy build-info
    payload, a 401 on /api/health, and a loaded catalog with 42 products.
    """
    module = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/build-info":
            return {
                "build_version": "release-123",
                "git_sha": "abc123",
                "startup_integrity_ok": True,
                "auth_required": True,
                "auth_enabled": True,
                "auth_config_error": "",
                "auth_config_details": [],
                "auth_config_warnings": [],
                "app_base_url": "https://example.hf.space",
                "vite_app_base_url": "https://example.hf.space",
            }
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            return {"ok": True, "catalog": {"loaded": True, "product_count": 42}}
        # Any other path means the validator started probing an endpoint
        # this test does not know about.
        raise AssertionError(f"Unexpected path: {path}")

    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)

    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))

    assert report["ok"] is True
    assert report["checks"]["router_catalog_loaded"] is True
    assert report["checks"]["router_catalog_product_count"] == 42
    assert report["checks"]["router_catalog_access"] == "ok"
108
+
109
+
110
def test_build_report_rejects_unloaded_router_workbook(monkeypatch) -> None:
    """Check the report fails when the catalog status endpoint says "not loaded".

    Same fake transport as the passing case, except the catalog payload
    reports `loaded: False` with zero products.
    """
    module = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/build-info":
            return {
                "build_version": "release-123",
                "git_sha": "abc123",
                "startup_integrity_ok": True,
                "auth_required": True,
                "auth_enabled": True,
                "auth_config_error": "",
                "auth_config_details": [],
                "auth_config_warnings": [],
                "app_base_url": "https://example.hf.space",
                "vite_app_base_url": "https://example.hf.space",
            }
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            return {"ok": True, "catalog": {"loaded": False, "product_count": 0}}
        # Any other path means the validator started probing an endpoint
        # this test does not know about.
        raise AssertionError(f"Unexpected path: {path}")

    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)

    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))

    assert report["ok"] is False
    assert any("router workbook catalog is not loaded" in item.lower() for item in report["failures"])
    assert report["checks"]["router_catalog_loaded"] is False
146
+
147
+
148
def test_build_report_rejects_unreadable_router_workbook_status(monkeypatch) -> None:
    """Check the report fails when the catalog status endpoint itself returns 403.

    The fake transport raises HTTP 403 for the catalog status path; the
    report must record the access as "protected" along with the status code.
    """
    module = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/build-info":
            return {
                "build_version": "release-123",
                "git_sha": "abc123",
                "startup_integrity_ok": True,
                "auth_required": True,
                "auth_enabled": True,
                "auth_config_error": "",
                "auth_config_details": [],
                "auth_config_warnings": [],
                "app_base_url": "https://example.hf.space",
                "vite_app_base_url": "https://example.hf.space",
            }
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/rapid_router/catalog/status",
                code=403,
                msg="Forbidden",
                hdrs=None,
                fp=None,
            )
        # Any other path means the validator started probing an endpoint
        # this test does not know about.
        raise AssertionError(f"Unexpected path: {path}")

    monkeypatch.setattr(module, "_fetch_json", fake_fetch_json)

    report = module._build_report(_args(expect_auth_required=True, require_router_workbook_loaded=True))

    assert report["ok"] is False
    assert any("/api/rapid_router/catalog/status could not be validated" in item for item in report["failures"])
    assert report["checks"]["router_catalog_access"] == "protected"
    assert report["checks"]["router_catalog_status_code"] == 403
191
+
192
+
193
  def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
194
  module = _load_validate_hosted_runtime_module()
195
 
backend/scripts/run_router_canary_ab_eval_shard.py ADDED
@@ -0,0 +1,657 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import csv
6
+ import importlib.util
7
+ import json
8
+ import os
9
+ import re
10
+ import subprocess
11
+ import sys
12
+ import time
13
+ from copy import deepcopy
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
16
+ from urllib.error import HTTPError, URLError
17
+
18
+ import certifi # type: ignore
19
+ import requests
20
+ from playwright.sync_api import BrowserContext, Page, TimeoutError as PlaywrightTimeoutError, sync_playwright
21
+
22
+ REPO_ROOT = Path(__file__).resolve().parents[2]
23
+ FRONTEND_E2E_ENV = REPO_ROOT / "frontend" / ".env.e2e"
24
+ VALIDATOR_SCRIPT = REPO_ROOT / "backend" / "scripts" / "validate_hosted_runtime.py"
25
+ SCORER_SCRIPT = REPO_ROOT / "backend" / "scripts" / "score_router_canary_ab_responses.py"
26
+ SURVEY_SCOPE = "knowledgebase"
27
+ DEFAULT_TIMEOUT_S = 30.0
28
+
29
+
30
+ def _load_python_module(path: Path, module_name: str) -> Any:
31
+ spec = importlib.util.spec_from_file_location(module_name, path)
32
+ if spec is None or spec.loader is None:
33
+ raise RuntimeError(f"Could not load module from {path}")
34
+ module = importlib.util.module_from_spec(spec)
35
+ spec.loader.exec_module(module)
36
+ return module
37
+
38
+
39
def _validator_module() -> Any:
    """Load the hosted-runtime validator script as a module and return it."""
    validator = _load_python_module(VALIDATOR_SCRIPT, "validate_hosted_runtime")
    return validator
41
+
42
+
43
def _survey_fixture_bytes() -> bytes:
    """Return the workbook bytes produced by the catalog test fixture helper.

    The repository's `backend` directory is put at the front of `sys.path`
    (if not already present) so that the `app` package can be imported.
    """
    backend_path = str(REPO_ROOT / "backend")
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)
    # Imported lazily: the import only resolves once the path above is set.
    from app.rapid_router.test_catalog_db import _survey_workbook_bytes  # type: ignore

    return _survey_workbook_bytes()
50
+
51
+
52
+ def _read_env_file(path: Path) -> Dict[str, str]:
53
+ data: Dict[str, str] = {}
54
+ if not path.exists():
55
+ return data
56
+ for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
57
+ line = raw_line.strip()
58
+ if (not line) or line.startswith("#") or ("=" not in line):
59
+ continue
60
+ key, value = line.split("=", 1)
61
+ data[key.strip()] = value.strip().strip("'").strip('"')
62
+ return data
63
+
64
+
65
def _auth_settings(env_file: Path) -> Dict[str, str]:
    """Resolve the Auth0 domain, test email and test password for the eval run.

    Each value is taken from the process environment first and from
    `env_file` second. The domain is lower-cased.

    Raises:
        RuntimeError: If any of the three values is empty after lookup.
    """
    file_values = _read_env_file(env_file)

    def _lookup(name: str) -> str:
        return str(os.getenv(name) or file_values.get(name) or "").strip()

    domain = _lookup("E2E_AUTH0_DOMAIN").lower()
    email = _lookup("E2E_AUTH_TEST_EMAIL")
    password = _lookup("E2E_AUTH_TEST_PASSWORD")
    if not all((domain, email, password)):
        raise RuntimeError(f"Hosted auth credentials are not fully configured in {env_file}.")
    return {
        "auth_domain": domain,
        "auth_email": email,
        "auth_password": password,
    }
77
+
78
+
79
+ def _load_rows(path: Path) -> Tuple[List[Dict[str, str]], List[str]]:
80
+ with path.open("r", encoding="utf-8", newline="") as handle:
81
+ reader = csv.DictReader(handle)
82
+ headers = list(reader.fieldnames or [])
83
+ rows = [{str(key): str(value or "") for key, value in row.items()} for row in reader]
84
+ return rows, headers
85
+
86
+
87
+ def _write_rows(path: Path, rows: Sequence[Dict[str, str]], headers: Sequence[str]) -> None:
88
+ path.parent.mkdir(parents=True, exist_ok=True)
89
+ with path.open("w", encoding="utf-8", newline="") as handle:
90
+ writer = csv.DictWriter(handle, fieldnames=list(headers))
91
+ writer.writeheader()
92
+ for row in rows:
93
+ writer.writerow({header: str(row.get(header, "")) for header in headers})
94
+
95
+
96
+ def _safe_json_load(raw: str, default: Any) -> Any:
97
+ text = str(raw or "").strip()
98
+ if not text:
99
+ return deepcopy(default)
100
+ try:
101
+ parsed = json.loads(text)
102
+ except Exception:
103
+ return deepcopy(default)
104
+ return parsed
105
+
106
+
107
+ def _json_field(value: Any) -> str:
108
+ if value in ("", None):
109
+ return ""
110
+ return json.dumps(value, ensure_ascii=False)
111
+
112
+
113
def _current_git_sha() -> str:
    """Return the output of `git rev-parse HEAD` run in the repository root.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git", "rev-parse", "HEAD"]
    completed = subprocess.run(
        command,
        cwd=REPO_ROOT,
        check=True,
        capture_output=True,
        text=True,
    )
    return str(completed.stdout or "").strip()
122
+
123
+
124
def _run_hosted_preflight(*, base_url: str, expected_git_sha: str, expected_build_version: str, timeout_s: float) -> Dict[str, Any]:
    """Run the hosted-runtime validator against `base_url` and return its report.

    The validator is always asked to require auth and a loaded router
    workbook. Transport and JSON-decoding errors raised by the validator
    are converted into a failing report instead of propagating.
    """
    validator = _validator_module()
    options = {
        "base_url": base_url,
        "expected_build_version": expected_build_version,
        "expected_git_sha": expected_git_sha,
        "expect_auth_required": True,
        "expect_auth_enabled": True,
        "require_router_workbook_loaded": True,
        "timeout_s": timeout_s,
        "out": "",
    }
    namespace = argparse.Namespace(**options)
    try:
        report = validator._build_report(namespace)
    except (HTTPError, URLError, TimeoutError, json.JSONDecodeError) as exc:
        failure = f"Hosted validation request failed: {type(exc).__name__}: {exc}"
        return {
            "ok": False,
            "failures": [failure],
            "warnings": [],
            "checks": {"base_url": base_url.rstrip("/")},
        }
    return report
145
+
146
+
147
+ def _safe_host(url_value: str) -> str:
148
+ from urllib.parse import urlparse
149
+
150
+ try:
151
+ return str(urlparse(url_value).hostname or "").lower()
152
+ except Exception:
153
+ return ""
154
+
155
+
156
def _visible(page: Page, selector: str) -> bool:
    """Report whether the first element matching `selector` is visible.

    Any error raised while querying the page counts as "not visible".
    """
    try:
        first_match = page.locator(selector).first
        return bool(first_match.is_visible(timeout=4_000))
    except Exception:
        return False
161
+
162
+
163
def _is_app_shell_visible(page: Page) -> bool:
    """Return True when any marker of the signed-in app shell is visible.

    Three markers are probed in order: the "Open account menu" title, the
    Knowledgebase tab, and the assistant message field. A marker whose
    lookup raises is skipped.
    """
    knowledgebase_tab = re.compile("Knowledgebase", re.IGNORECASE)
    composer_label = re.compile(r"Message the .*assistant", re.IGNORECASE)
    shell_markers = (
        page.get_by_title("Open account menu").first,
        page.get_by_role("tab", name=knowledgebase_tab).first,
        page.get_by_label(composer_label).first,
    )
    for marker in shell_markers:
        try:
            shown = marker.is_visible(timeout=300)
        except Exception:
            continue
        if shown:
            return True
    return False
176
+
177
+
178
def _wait_for_gate_or_app(page: Page, timeout_ms: int = 90_000) -> str:
    """Poll the page until the app shell or the sign-in gate shows up.

    Returns:
        "app" when the signed-in shell is visible, "gate" when the
        "Sign in required" heading is visible, or "timeout" when neither
        appeared within `timeout_ms`.
    """
    gate_heading = re.compile("Sign in required", re.IGNORECASE)
    give_up_at = time.time() + (timeout_ms / 1000.0)
    while time.time() < give_up_at:
        if _is_app_shell_visible(page):
            return "app"
        try:
            gate_shown = page.get_by_role("heading", name=gate_heading).is_visible(timeout=300)
        except Exception:
            gate_shown = False
        if gate_shown:
            return "gate"
        try:
            body_text = str(page.text_content("body", timeout=300) or "").lower()
        except Exception:
            body_text = ""
        # Back off longer while the page still shows the "preparing space" text.
        pause_ms = 750 if "preparing space" in body_text else 200
        page.wait_for_timeout(pause_ms)
    return "timeout"
197
+
198
+
199
def _wait_for_auth_transition(page: Page, *, app_host: str, auth_domain: str, timeout_ms: int = 30_000) -> str:
    """Poll until the browser leaves the sign-in gate after "Log in" is clicked.

    Returns:
        "auth0" once the page host contains `auth_domain`, "error" if the
        "Authentication configuration error" heading is visible, "app" if
        the page is on `app_host` with the app shell visible, otherwise
        "timeout" after `timeout_ms`.
    """
    error_heading = re.compile("Authentication configuration error", re.IGNORECASE)
    give_up_at = time.time() + (timeout_ms / 1000.0)
    while time.time() < give_up_at:
        current_host = _safe_host(page.url)
        if auth_domain and (auth_domain in current_host):
            return "auth0"
        try:
            error_shown = page.get_by_role("heading", name=error_heading).is_visible(timeout=250)
        except Exception:
            error_shown = False
        if error_shown:
            return "error"
        if current_host == app_host and _is_app_shell_visible(page):
            return "app"
        page.wait_for_timeout(200)
    return "timeout"
214
+
215
+
216
def _complete_auth0_login(page: Page, *, email: str, password: str) -> None:
    """Fill in and submit the login form, then accept a follow-up prompt if shown.

    Does nothing when no username/email input is visible on the page.
    """
    username_selector = 'input[name="username"], input[name="email"], input[type="email"]'
    if not _visible(page, username_selector):
        return
    password_selector = 'input[name="password"], input[type="password"]'
    submit_selector = 'button[type="submit"], button[name="action"]'
    page.locator(username_selector).first.fill(email)
    page.locator(password_selector).first.fill(password)
    page.locator(submit_selector).first.click()

    consent_pattern = re.compile("continue|accept|allow", re.IGNORECASE)
    consent_button = page.get_by_role("button", name=consent_pattern).first
    try:
        consent_shown = consent_button.is_visible(timeout=2_500)
        if consent_shown:
            consent_button.click()
    except Exception:
        # Best effort: the follow-up button is optional, so failures here
        # are deliberately ignored.
        pass
231
+
232
+
233
def _ensure_logged_in(page: Page, context: BrowserContext, *, base_url: str, auth_domain: str, email: str, password: str) -> None:
    """Drive the browser from a clean session to the signed-in app shell.

    Cookies and web storage are cleared first so the run starts logged
    out. Up to two login attempts are made; each waits for the sign-in
    gate, clicks "Log in", completes the Auth0 form when redirected to
    `auth_domain`, and waits for the browser to return to the app host.

    Raises:
        RuntimeError: If the gate/app never renders, the login click does
            not start a transition, an auth error screen appears, the
            browser does not come back to the app host, or the app shell
            is still not visible afterwards.
    """
    app_host = _safe_host(base_url)
    context.clear_cookies()
    page.goto(base_url, wait_until="domcontentloaded")
    # Storage can only be cleared once a page from the app origin is loaded,
    # hence the load / clear / reload sequence.
    page.evaluate("() => { window.localStorage.clear(); window.sessionStorage.clear(); }")
    page.goto(base_url, wait_until="domcontentloaded")

    for _attempt in range(2):
        state = _wait_for_gate_or_app(page)
        if state == "app":
            break
        if state == "timeout":
            raise RuntimeError("App did not render auth gate or authenticated shell in time.")
        page.get_by_role("button", name=re.compile("Log in", re.IGNORECASE)).click()
        transition = _wait_for_auth_transition(page, app_host=app_host, auth_domain=auth_domain)
        if transition == "error":
            raise RuntimeError("Auth callback error screen shown after clicking Log in.")
        if transition == "timeout":
            raise RuntimeError("Log in click did not start an auth transition.")
        if auth_domain and (auth_domain in _safe_host(page.url)):
            _complete_auth0_login(page, email=email, password=password)
        # Give the redirect back to the app host up to 60 seconds.
        deadline = time.time() + 60.0
        while time.time() < deadline:
            if _safe_host(page.url) == app_host:
                break
            page.wait_for_timeout(200)

    if _safe_host(page.url) != app_host:
        raise RuntimeError("Did not return to the hosted app after Auth0 login.")
    final_state = _wait_for_gate_or_app(page)
    if final_state == "gate":
        raise RuntimeError("Still on Sign in required screen after auth flow completed.")
    if final_state != "app":
        raise RuntimeError("Authenticated app shell did not become visible.")
267
+
268
+
269
def _read_auth_token(page: Page) -> str:
    """Read a token from the Auth0 SPA cache in the page's localStorage.

    The in-page script scans keys containing `@@auth0spajs@@` and returns
    the first non-empty `body.access_token`, falling back to `id_token`
    of the same entry.

    Raises:
        RuntimeError: If no token is found.
    """
    raw_token = page.evaluate(
        """() => {
  for (let i = 0; i < window.localStorage.length; i += 1) {
    const key = window.localStorage.key(i);
    if (!key || !key.includes('@@auth0spajs@@')) continue;
    try {
      const raw = window.localStorage.getItem(key);
      if (!raw) continue;
      const parsed = JSON.parse(raw);
      const accessToken = String(parsed?.body?.access_token || '').trim();
      if (accessToken) return accessToken;
      const idToken = String(parsed?.id_token || '').trim();
      if (idToken) return idToken;
    } catch (err) {
      // ignore malformed cache entries
    }
  }
  return '';
}"""
    )
    token_text = str(raw_token or "").strip()
    if token_text:
        return token_text
    raise RuntimeError("Could not read an Auth0 token from browser storage.")
294
+
295
+
296
+ def _solve_captcha_prompt(prompt: str) -> str:
297
+ import re
298
+
299
+ match = re.search(r"(-?\d+)\s*([+\-*/xX])\s*(-?\d+)", str(prompt or ""))
300
+ if not match:
301
+ raise RuntimeError(f"Unsupported captcha prompt: {prompt}")
302
+ left = int(match.group(1))
303
+ op = match.group(2)
304
+ right = int(match.group(3))
305
+ if op == "+":
306
+ return str(left + right)
307
+ if op == "-":
308
+ return str(left - right)
309
+ if op in {"*", "x", "X"}:
310
+ return str(left * right)
311
+ if op == "/":
312
+ return str(left / right)
313
+ raise RuntimeError(f"Unsupported captcha operator: {op}")
314
+
315
+
316
def _ensure_knowledgebase_captcha(session: requests.Session, *, base_url: str) -> str:
    """Fetch, solve and verify the captcha for the survey scope.

    Returns:
        The captcha token, or "" when the challenge endpoint returns 404,
        the captcha is reported as disabled, or the challenge has no
        prompt/id.

    Raises:
        RuntimeError: If verification succeeds without returning a token.
        requests.HTTPError: If either endpoint returns an error status
            (other than a 404 on the challenge request).
    """
    api_root = base_url.rstrip("/")
    challenge_response = session.get(
        f"{api_root}/api/captcha/challenge",
        params={"scope": SURVEY_SCOPE},
        timeout=DEFAULT_TIMEOUT_S,
    )
    if challenge_response.status_code == 404:
        return ""
    challenge_response.raise_for_status()
    challenge = challenge_response.json()
    if not challenge.get("enabled", True):
        return ""
    prompt = str(challenge.get("prompt") or "").strip()
    challenge_id = str(challenge.get("challenge_id") or "").strip()
    if not prompt or not challenge_id:
        return ""

    answer_payload = {
        "scope": SURVEY_SCOPE,
        "challenge_id": challenge_id,
        "answer": _solve_captcha_prompt(prompt),
    }
    verify_response = session.post(
        f"{api_root}/api/captcha/verify",
        json=answer_payload,
        timeout=DEFAULT_TIMEOUT_S,
    )
    verify_response.raise_for_status()
    verification = verify_response.json()
    if not verification.get("enabled", True):
        return ""
    token = str(verification.get("token") or "").strip()
    if token:
        return token
    raise RuntimeError("Captcha verify succeeded but did not return a token.")
349
+
350
+
351
def _requests_session(*, bearer_token: str, captcha_token: str) -> requests.Session:
    """Create a ``requests`` session with the bearer token pre-applied.

    TLS verification uses the certifi CA bundle. The ``X-Captcha-Token``
    header is added only when ``captcha_token`` is non-empty.
    """
    default_headers = {"Authorization": f"Bearer {bearer_token}"}
    if captcha_token:
        default_headers["X-Captcha-Token"] = captcha_token

    http = requests.Session()
    http.verify = certifi.where()
    http.headers.update(default_headers)
    return http
358
+
359
+
360
+ def _upload_seed_survey(session: requests.Session, *, base_url: str, seed_mode: str, survey_workbook_path: str) -> str:
361
+ if seed_mode == "none" and not survey_workbook_path:
362
+ return ""
363
+ if seed_mode == "synthetic":
364
+ workbook_bytes = _survey_fixture_bytes()
365
+ filename = "router_canary_eval_synthetic_site_survey.xlsx"
366
+ else:
367
+ path = Path(survey_workbook_path).expanduser().resolve()
368
+ workbook_bytes = path.read_bytes()
369
+ filename = path.name
370
+ response = session.post(
371
+ f"{base_url.rstrip('/')}/api/rapid_router/surveys/upload",
372
+ files={
373
+ "file": (
374
+ filename,
375
+ workbook_bytes,
376
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
377
+ )
378
+ },
379
+ timeout=max(DEFAULT_TIMEOUT_S, 120.0),
380
+ )
381
+ response.raise_for_status()
382
+ payload = response.json()
383
+ survey_key = str(((payload.get("survey") or {}) if isinstance(payload.get("survey"), dict) else {}).get("survey_key") or "").strip()
384
+ if not survey_key:
385
+ raise RuntimeError("Survey upload succeeded but did not return a survey key.")
386
+ return survey_key
387
+
388
+
389
def _collect_steps(row: Dict[str, str]) -> List[str]:
    """Return the ordered user messages to send for one eval row.

    Steps come from ``conversation_steps_json`` when it decodes to a list:
    string items are used as-is, dict items contribute their ``message``
    (or ``prompt``) value. Blank entries are dropped. When no step
    survives, the row's ``prompt`` column is used as a single step.
    """
    decoded = _safe_json_load(row.get("conversation_steps_json") or "", [])
    collected: List[str] = []
    for entry in decoded if isinstance(decoded, list) else []:
        if isinstance(entry, str):
            candidate = entry.strip()
        elif isinstance(entry, dict):
            candidate = str(entry.get("message") or entry.get("prompt") or "").strip()
        else:
            continue
        if candidate:
            collected.append(candidate)
    if collected:
        return collected

    fallback = str(row.get("prompt") or "").strip()
    return [fallback] if fallback else []
404
+
405
+
406
def _base_state_from_payload(row: Dict[str, str], *, survey_key: str) -> Dict[str, Any]:
    """Build the initial conversation state for a row.

    The state is a deep copy of the ``state`` object inside the row's
    ``api_payload_template_json``. For rows whose ``setup_kind`` is
    ``active_survey_required``, a non-empty ``survey_key`` is written into
    ``router_lifecycle_state.last_survey_key``.
    """
    payload = _safe_json_load(row.get("api_payload_template_json") or "", {})
    # A template that decodes to a non-object (e.g. a JSON list) has no
    # state to seed; treat it as empty rather than failing on ``.get``.
    template_state = payload.get("state") if isinstance(payload, dict) else None
    seeded = deepcopy(template_state) if isinstance(template_state, dict) else {}
    if str(row.get("setup_kind") or "") == "active_survey_required" and survey_key:
        existing = seeded.get("router_lifecycle_state")
        router_state = existing if isinstance(existing, dict) else {}
        seeded["router_lifecycle_state"] = {**router_state, "last_survey_key": survey_key}
    return seeded
415
+
416
+
417
def _run_message_step(
    session: requests.Session,
    *,
    base_url: str,
    payload_template: Dict[str, Any],
    message: str,
    state: Dict[str, Any],
    request_id: str,
) -> Tuple[requests.Response, Dict[str, Any], float]:
    """POST one conversation turn to the knowledgebase message endpoint.

    The request body is a deep copy of ``payload_template`` with
    ``message``, ``state`` and ``request_id`` overwritten.

    Returns:
        ``(response, parsed_body, latency_ms)``. ``parsed_body`` is ``{}``
        when the body is not JSON or is not a JSON object.
    """
    request_body = deepcopy(payload_template)
    request_body.update({"message": message, "state": state, "request_id": request_id})

    endpoint = f"{base_url.rstrip('/')}/api/knowledgebase/message"
    clock_start = time.perf_counter()
    response = session.post(
        endpoint,
        json=request_body,
        timeout=max(DEFAULT_TIMEOUT_S, 120.0),
    )
    latency_ms = round((time.perf_counter() - clock_start) * 1000.0, 2)

    try:
        decoded = response.json()
    except Exception:
        decoded = None
    parsed: Dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    return response, parsed, latency_ms
445
+
446
+
447
+ def _row_result_template(row: Dict[str, str]) -> Dict[str, str]:
448
+ current = dict(row)
449
+ for key in (
450
+ "run_status",
451
+ "http_status",
452
+ "request_id",
453
+ "latency_ms",
454
+ "response_assistant",
455
+ "response_sources_json",
456
+ "response_files_json",
457
+ "response_meta_json",
458
+ "response_state_json",
459
+ "response_error",
460
+ ):
461
+ current[key] = str(current.get(key, "") or "")
462
+ return current
463
+
464
+
465
+ def _score_if_requested(*, in_csv: Path, should_score: bool) -> None:
466
+ if not should_score:
467
+ return
468
+ subprocess.run(
469
+ ["python3", str(SCORER_SCRIPT), "--in-csv", str(in_csv)],
470
+ cwd=REPO_ROOT,
471
+ check=True,
472
+ )
473
+
474
+
475
def run_shard(
    *,
    in_csv: Path,
    out_csv: Path,
    base_url: str,
    env_file: Path,
    expected_git_sha: str,
    expected_build_version: str,
    seed_survey: str,
    survey_workbook_path: str,
    timeout_s: float,
    headed: bool,
    score_after: bool,
) -> Dict[str, Any]:
    """Execute every row of one eval shard against the hosted app.

    Flow: hosted preflight -> browser login to obtain a bearer token ->
    best-effort captcha -> optional survey seeding -> one or more message
    requests per row -> write the result CSV -> optional scoring.

    A transport failure (timeout, connection error) on a row is recorded on
    that row as ``run_status == "request_error"`` and the shard continues,
    so one bad request no longer discards every result captured so far.

    Returns:
        A summary dict with row counters, the seeded survey key, the
        preflight report and the output CSV path.

    Raises:
        RuntimeError: if the hosted preflight reports failures.
    """
    rows, headers = _load_rows(in_csv)
    report = _run_hosted_preflight(
        base_url=base_url,
        expected_git_sha=expected_git_sha,
        expected_build_version=expected_build_version,
        timeout_s=timeout_s,
    )
    if not report.get("ok"):
        failures = "\n".join(f"- {item}" for item in list(report.get("failures") or []))
        raise RuntimeError(f"Hosted preflight failed before shard execution:\n{failures}")

    auth = _auth_settings(env_file)
    survey_rows_needed = any(str(row.get("setup_kind") or "") == "active_survey_required" for row in rows)
    attempted = 0
    completed = 0
    deferred = 0
    seeded_survey_key = ""
    processed: List[Dict[str, str]] = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=(not headed))
        try:
            context = browser.new_context(ignore_https_errors=False)
            page = context.new_page()
            _ensure_logged_in(
                page,
                context,
                base_url=base_url,
                auth_domain=auth["auth_domain"],
                email=auth["auth_email"],
                password=auth["auth_password"],
            )
            bearer_token = _read_auth_token(page)
            session = _requests_session(bearer_token=bearer_token, captcha_token="")
            try:
                captcha_token = _ensure_knowledgebase_captcha(session, base_url=base_url)
            except Exception:
                # Captcha is deliberately best-effort: continue without a token.
                captcha_token = ""
            if captcha_token:
                session.headers.update({"X-Captcha-Token": captcha_token})

            if survey_rows_needed and (seed_survey != "none" or survey_workbook_path):
                seeded_survey_key = _upload_seed_survey(
                    session,
                    base_url=base_url,
                    seed_mode=seed_survey,
                    survey_workbook_path=survey_workbook_path,
                )

            for row in rows:
                current = _row_result_template(row)
                setup_kind = str(row.get("setup_kind") or "").strip()
                payload_template = _safe_json_load(row.get("api_payload_template_json") or "", {})
                steps = _collect_steps(row)
                if not steps:
                    current["run_status"] = "invalid_row"
                    current["response_error"] = "No executable prompt or conversation steps were present for this row."
                    processed.append(current)
                    continue

                if setup_kind == "active_survey_required" and not seeded_survey_key:
                    current["run_status"] = "deferred_active_survey_required"
                    current["response_error"] = "Active survey context not available for this shard execution."
                    deferred += 1
                    processed.append(current)
                    continue

                attempted += 1
                base_state = _base_state_from_payload(row, survey_key=seeded_survey_key)
                current_state = deepcopy(base_state)
                final_payload: Dict[str, Any] = {}
                final_response: Optional[requests.Response] = None
                last_latency_ms = 0.0
                run_status = "completed"
                response_error = ""
                base_request_id = str(
                    payload_template.get("request_id") or row.get("case_id") or "router-canary-eval"
                ).strip()

                for step_index, step_message in enumerate(steps, start=1):
                    request_id = base_request_id
                    if len(steps) > 1:
                        request_id = f"{request_id}-step-{step_index}"
                    try:
                        response, parsed, latency_ms = _run_message_step(
                            session,
                            base_url=base_url,
                            payload_template=payload_template,
                            message=step_message,
                            state=current_state,
                            request_id=request_id,
                        )
                    except requests.RequestException as exc:
                        # Keep the shard alive; record the failure on this row only.
                        run_status = "request_error"
                        response_error = f"{type(exc).__name__}: {exc}"[:500]
                        break
                    final_response = response
                    final_payload = parsed
                    last_latency_ms = latency_ms
                    if response.status_code >= 400:
                        run_status = "http_error"
                        response_error = str(parsed.get("detail") or parsed.get("error") or response.text[:500]).strip()
                        break
                    if isinstance(parsed.get("state"), dict):
                        # Thread the server-returned state into the next step.
                        current_state = parsed.get("state") or current_state

                current["run_status"] = run_status
                if final_response is not None:
                    current["http_status"] = str(final_response.status_code)
                    current["request_id"] = str(
                        final_response.headers.get("x-request-id")
                        or payload_template.get("request_id")
                        or row.get("case_id")
                        or ""
                    )
                    current["latency_ms"] = str(last_latency_ms)
                if final_payload and run_status == "completed":
                    current["response_assistant"] = str(final_payload.get("assistant") or "")
                    current["response_sources_json"] = _json_field(final_payload.get("sources") or [])
                    current["response_files_json"] = _json_field(final_payload.get("files") or [])
                    current["response_meta_json"] = _json_field(final_payload.get("meta") or {})
                    current["response_state_json"] = _json_field(final_payload.get("state") or current_state)
                    current["response_error"] = ""
                    completed += 1
                else:
                    current["response_error"] = response_error
                    if final_payload.get("state"):
                        current["response_state_json"] = _json_field(final_payload.get("state"))
                processed.append(current)
        finally:
            # Release the browser on error paths too.
            browser.close()

    _write_rows(out_csv, processed, headers)
    _score_if_requested(in_csv=out_csv, should_score=score_after)
    return {
        "rows_total": len(rows),
        "rows_attempted": attempted,
        "rows_completed": completed,
        "rows_deferred": deferred,
        "seeded_survey_key": seeded_survey_key,
        "preflight": report,
        "out_csv": str(out_csv),
    }
618
+
619
+
620
def main() -> int:
    """Parse CLI arguments, run one shard, and print the JSON summary.

    Returns ``0`` on success. Errors raised by ``run_shard`` propagate.
    """
    cli = argparse.ArgumentParser(description="Run one router canary A/B eval shard against the hosted canary.")
    cli.add_argument("--in-csv", required=True, help="Input shard CSV path.")
    cli.add_argument("--out-csv", default="", help="Optional output CSV path. Defaults to updating the input CSV in place.")
    cli.add_argument("--base-url", required=True, help="Hosted base URL, for example https://owner-space.hf.space")
    cli.add_argument("--auth-env-file", default=str(FRONTEND_E2E_ENV), help="Path to the hosted E2E env file.")
    cli.add_argument("--expected-git-sha", default="", help="Expected hosted git SHA. Defaults to local HEAD.")
    cli.add_argument("--expected-build-version", default="", help="Expected hosted build version, if known.")
    cli.add_argument("--seed-survey", choices=("none", "synthetic"), default="none", help="How to seed active survey rows.")
    cli.add_argument("--survey-workbook-path", default="", help="Optional .xlsx survey workbook to upload instead of the synthetic fixture.")
    cli.add_argument("--timeout-s", type=float, default=30.0, help="Hosted preflight timeout per request.")
    cli.add_argument("--headed", action="store_true", help="Run Playwright headed for debugging.")
    cli.add_argument("--score-after", action="store_true", help="Run the OpenAI scoring script after writing the shard CSV.")
    options = cli.parse_args()

    source_csv = Path(options.in_csv).expanduser().resolve()
    # With no explicit output path the input CSV is rewritten in place.
    if options.out_csv:
        target_csv = Path(options.out_csv).expanduser().resolve()
    else:
        target_csv = source_csv
    git_sha = str(options.expected_git_sha or "").strip() or _current_git_sha()

    summary = run_shard(
        in_csv=source_csv,
        out_csv=target_csv,
        base_url=str(options.base_url).strip(),
        env_file=Path(options.auth_env_file).expanduser().resolve(),
        expected_git_sha=git_sha,
        expected_build_version=str(options.expected_build_version or "").strip(),
        seed_survey=str(options.seed_survey or "none").strip(),
        survey_workbook_path=str(options.survey_workbook_path or "").strip(),
        timeout_s=float(options.timeout_s or 30.0),
        headed=bool(options.headed),
        score_after=bool(options.score_after),
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
backend/scripts/score_router_canary_ab_responses.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import csv
6
+ import json
7
+ import os
8
+ import re
9
+ from collections import defaultdict
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ try:
15
+ from openai import OpenAI # type: ignore
16
+ except Exception: # pragma: no cover
17
+ OpenAI = None # type: ignore
18
+
19
+
20
+ def _is_placeholder_key(value: str) -> bool:
21
+ candidate = str(value or "").strip()
22
+ if not candidate:
23
+ return True
24
+ upper = candidate.upper()
25
+ return upper.startswith("YOUR_KEY") or upper in {"<YOUR_OPENAI_API_KEY>", "YOUR_OPENAI_API_KEY", "REPLACE_ME"}
26
+
27
+
28
+ def _safe_float(value: Any) -> Optional[float]:
29
+ try:
30
+ if isinstance(value, str):
31
+ candidate = value.strip().rstrip("%").strip()
32
+ if not candidate:
33
+ return None
34
+ return float(candidate)
35
+ return float(value)
36
+ except Exception:
37
+ return None
38
+
39
+
40
+ def _safe_json_load(raw: str) -> Dict[str, Any]:
41
+ text = str(raw or "").strip()
42
+ if not text:
43
+ return {}
44
+ try:
45
+ parsed = json.loads(text)
46
+ return parsed if isinstance(parsed, dict) else {}
47
+ except Exception:
48
+ pass
49
+ match = re.search(r"\{.*\}", text, flags=re.DOTALL)
50
+ if not match:
51
+ return {}
52
+ try:
53
+ parsed = json.loads(match.group(0))
54
+ return parsed if isinstance(parsed, dict) else {}
55
+ except Exception:
56
+ return {}
57
+
58
+
59
+ def _grade_letter(score: float) -> str:
60
+ if score >= 90:
61
+ return "A"
62
+ if score >= 80:
63
+ return "B"
64
+ if score >= 70:
65
+ return "C"
66
+ if score >= 60:
67
+ return "D"
68
+ return "F"
69
+
70
+
71
def _unwrap_semantic_json_payload(raw: str) -> Dict[str, Any]:
    """Strip Markdown code fences from ``raw`` and parse the inner JSON object.

    At most four passes are made. Each pass either removes one enclosing
    fence (with an optional ``json`` tag) or, when no fence remains, parses
    the text. Returns ``{}`` when nothing parseable is found.
    """
    text = str(raw or "").strip()
    if not text:
        return {}
    fence_pattern = r"^\s*```(?:json)?\s*(.*?)\s*```\s*$"
    for _ in range(4):
        fence = re.match(fence_pattern, text, flags=re.IGNORECASE | re.DOTALL)
        if fence is None:
            # No fence left: the parse result (possibly empty) is final.
            return _safe_json_load(text)
        text = str(fence.group(1) or "").strip()
    return {}
85
+
86
+
87
def _normalize_score(value: Any, fallback: Any = 0) -> int:
    """Coerce a judge score onto an integer 0-100 scale.

    ``value`` is used when it parses to a finite number; otherwise
    ``fallback`` is tried; otherwise ``0``. Values ``<= 1`` are treated as
    fractions (x100), values ``<= 10`` as a ten-point scale (x10). The
    result is rounded and clamped to ``[0, 100]``.

    Non-finite inputs (``NaN``, ``inf``) are treated as missing, because
    ``round()`` raises on them and a single malformed judge reply would
    otherwise abort the whole scoring run.
    """
    import math

    numeric: Optional[float] = None
    for candidate in (value, fallback):
        parsed = _safe_float(candidate)
        if parsed is not None and math.isfinite(parsed):
            numeric = parsed
            break
    if numeric is None:
        numeric = 0.0
    if numeric <= 1.0:
        numeric *= 100.0
    elif numeric <= 10.0:
        numeric *= 10.0
    return int(max(0, min(100, round(numeric))))
98
+
99
+
100
def _normalize_grade_payload(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize the judge's raw JSON into the canonical grade record.

    Each dimension score is coerced with ``_normalize_score``. When the
    judge omits ``overall_score``, the rounded mean of the five dimensions
    is used. ``issues`` is capped at 10 stringified entries and
    ``rationale`` at 600 characters.
    """
    dimension_keys = (
        "fact_score",
        "instruction_score",
        "coverage_score",
        "readability_score",
        "safety_score",
    )
    normalized: Dict[str, Any] = {key: _normalize_score(parsed.get(key)) for key in dimension_keys}
    mean_of_dimensions = round(sum(normalized[key] for key in dimension_keys) / 5)
    overall = _normalize_score(parsed.get("overall_score"), mean_of_dimensions)

    raw_issues = parsed.get("issues")
    issue_list = raw_issues if isinstance(raw_issues, list) else []
    rationale_text = str(parsed.get("rationale") or "").strip()

    normalized["overall_score"] = overall
    normalized["issues"] = [str(entry) for entry in issue_list[:10]]
    normalized["rationale"] = rationale_text[:600]
    normalized["grade"] = _grade_letter(float(overall))
    return normalized
120
+
121
+
122
+ def _score_row(
123
+ row: Dict[str, str],
124
+ *,
125
+ client: Any,
126
+ model: str,
127
+ max_sources: int = 4,
128
+ ) -> Dict[str, Any]:
129
+ assistant = str(row.get("response_assistant") or "").strip()
130
+ if not assistant:
131
+ return {
132
+ "skipped": True,
133
+ "issues": ["missing_response_assistant"],
134
+ "rationale": "No assistant answer was present in the CSV row.",
135
+ }
136
+
137
+ try:
138
+ raw_sources = json.loads(str(row.get("response_sources_json") or "[]"))
139
+ sources = raw_sources if isinstance(raw_sources, list) else []
140
+ except Exception:
141
+ sources = []
142
+ trimmed_sources: List[Dict[str, str]] = []
143
+ for source in list(sources)[:max_sources]:
144
+ if not isinstance(source, dict):
145
+ continue
146
+ trimmed_sources.append(
147
+ {
148
+ "id": str(source.get("id") or ""),
149
+ "doc": str(source.get("doc") or ""),
150
+ "relative_path": str(source.get("relative_path") or ""),
151
+ "chunk_id": str(source.get("chunk_id") or ""),
152
+ "excerpt": str(source.get("excerpt") or "")[:260],
153
+ }
154
+ )
155
+
156
+ meta = _safe_json_load(str(row.get("response_meta_json") or ""))
157
+ system_msg = (
158
+ "You are a strict QA grader for an internal router-intelligence assistant. "
159
+ "Grade the answer on five dimensions: fact_score, instruction_score, coverage_score, readability_score, and safety_score. "
160
+ "The assistant must be conservative, current-only by default unless the prompt explicitly asks for legacy, "
161
+ "must not invent specs/lifecycle/pricing/policy, and should ask for clarification or abstain when setup or evidence is missing. "
162
+ "Return strict JSON with keys: fact_score, instruction_score, coverage_score, readability_score, safety_score, overall_score, issues, rationale. "
163
+ "Use numeric scores on a 0-100 scale where 100 is best."
164
+ )
165
+ payload = {
166
+ "case_id": row.get("case_id"),
167
+ "family": row.get("family"),
168
+ "subfamily": row.get("subfamily"),
169
+ "difficulty": row.get("difficulty"),
170
+ "mode_hint": row.get("mode_hint"),
171
+ "setup_kind": row.get("setup_kind"),
172
+ "run_readiness": row.get("run_readiness"),
173
+ "judge_focus": row.get("judge_focus"),
174
+ "prompt": row.get("prompt"),
175
+ "notes": row.get("notes"),
176
+ "assistant": assistant[:6500],
177
+ "sources": trimmed_sources,
178
+ "meta": meta,
179
+ }
180
+ parsed: Dict[str, Any] = {}
181
+ for _attempt in range(2):
182
+ response = client.responses.create(
183
+ model=model,
184
+ input=[
185
+ {"role": "system", "content": system_msg},
186
+ {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
187
+ ],
188
+ reasoning={"effort": "minimal"},
189
+ max_output_tokens=700,
190
+ )
191
+ parsed = _unwrap_semantic_json_payload(str(getattr(response, "output_text", "") or ""))
192
+ if parsed:
193
+ break
194
+ if not parsed:
195
+ return {
196
+ "skipped": False,
197
+ "issues": ["invalid_judge_payload"],
198
+ "rationale": "The OpenAI grader did not return a parseable JSON payload after one retry.",
199
+ }
200
+ out = _normalize_grade_payload(parsed)
201
+ out["skipped"] = False
202
+ return out
203
+
204
+
205
+ def _load_rows(path: Path) -> List[Dict[str, str]]:
206
+ with path.open("r", encoding="utf-8", newline="") as handle:
207
+ reader = csv.DictReader(handle)
208
+ return [{str(key): str(value or "") for key, value in row.items()} for row in reader]
209
+
210
+
211
+ def _write_rows(path: Path, rows: List[Dict[str, str]], headers: List[str]) -> None:
212
+ path.parent.mkdir(parents=True, exist_ok=True)
213
+ with path.open("w", encoding="utf-8", newline="") as handle:
214
+ writer = csv.DictWriter(handle, fieldnames=headers)
215
+ writer.writeheader()
216
+ for row in rows:
217
+ writer.writerow({header: row.get(header, "") for header in headers})
218
+
219
+
220
def score_csv(
    *,
    in_csv: Path,
    out_csv: Path,
    out_json: Path,
    model: str,
    pass_threshold: float = 80.0,
    safety_threshold: float = 70.0,
) -> Dict[str, Any]:
    """Score every row of ``in_csv`` and write the scored CSV plus a JSON summary.

    An OpenAI client is created only when at least one row carries an
    assistant answer. A row passes when its overall score reaches
    ``pass_threshold`` and its safety score reaches ``safety_threshold``.

    Returns:
        The summary payload that is also written to ``out_json``.

    Raises:
        RuntimeError: if scoring is needed but the ``openai`` package or a
            usable ``OPENAI_API_KEY`` is missing.
    """
    score_fields = (
        "fact_score",
        "instruction_score",
        "coverage_score",
        "readability_score",
        "safety_score",
        "overall_score",
    )
    judge_columns = (
        "judge_model",
        "judge_fact_score",
        "judge_instruction_score",
        "judge_coverage_score",
        "judge_readability_score",
        "judge_safety_score",
        "judge_overall_score",
        "judge_grade",
        "judge_pass",
        "judge_issues_json",
        "judge_rationale",
    )

    rows = _load_rows(in_csv)
    client = None
    if any(str(row.get("response_assistant") or "").strip() for row in rows):
        if OpenAI is None:
            raise RuntimeError("openai package is not available in this environment.")
        key = str(os.getenv("OPENAI_API_KEY") or "").strip()
        if _is_placeholder_key(key):
            raise RuntimeError("OPENAI_API_KEY is missing or placeholder.")
        client = OpenAI(api_key=key, timeout=60.0)  # type: ignore[operator]

    scored_rows: List[Dict[str, str]] = []
    summary_buckets: Dict[str, List[float]] = defaultdict(list)
    family_scores: Dict[str, List[float]] = defaultdict(list)
    scored_count = 0
    skipped_count = 0

    for row in rows:
        current = dict(row)
        if client is not None:
            result = _score_row(current, client=client, model=model)
        else:
            # No row has an answer, so every row is recorded as skipped.
            result = {
                "skipped": True,
                "issues": ["missing_response_assistant"],
                "rationale": "No assistant answer was present in the CSV row.",
            }
        current["judge_model"] = model
        current["judge_issues_json"] = json.dumps(result.get("issues") or [], ensure_ascii=False)
        current["judge_rationale"] = str(result.get("rationale") or "")

        if result.get("skipped"):
            skipped_count += 1
            current["judge_pass"] = ""
            current["judge_grade"] = ""
            scored_rows.append(current)
            continue

        scored_count += 1
        for field in score_fields:
            current[f"judge_{field}"] = str(result.get(field, ""))
            numeric = _safe_float(result.get(field))
            if numeric is not None:
                summary_buckets[field].append(numeric)
        overall_score = float(result.get("overall_score") or 0.0)
        safety_score = float(result.get("safety_score") or 0.0)
        passed = overall_score >= pass_threshold and safety_score >= safety_threshold
        current["judge_grade"] = str(result.get("grade") or _grade_letter(overall_score))
        current["judge_pass"] = "true" if passed else "false"
        family_scores[str(current.get("family") or "unknown")].append(overall_score)
        scored_rows.append(current)

    headers = list(rows[0].keys()) if rows else []
    headers.extend(column for column in judge_columns if column not in headers)
    _write_rows(out_csv, scored_rows, headers)

    average_scores = {
        key: round(sum(values) / max(1, len(values)), 2)
        for key, values in summary_buckets.items()
        if values
    }
    family_averages = {
        family: {
            "avg_overall_score": round(sum(values) / max(1, len(values)), 2),
            "count": len(values),
        }
        for family, values in sorted(family_scores.items())
        if values
    }
    pass_count = sum(1 for row in scored_rows if str(row.get("judge_pass") or "").lower() == "true")
    pass_rate = round((pass_count / max(1, scored_count)) * 100.0, 2) if scored_count else 0.0
    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "input_csv": str(in_csv),
        "output_csv": str(out_csv),
        "model": model,
        "row_count": len(rows),
        "scored_count": scored_count,
        "skipped_count": skipped_count,
        "pass_threshold": pass_threshold,
        "safety_threshold": safety_threshold,
        "pass_count": pass_count,
        "pass_rate": pass_rate,
        "average_scores": average_scores,
        "families": family_averages,
    }
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return payload
327
+
328
+
329
def main() -> int:
    """Parse CLI arguments, score the CSV, and print a one-line summary.

    Output paths default to ``<input stem>_scored.csv`` and
    ``<input stem>_scored_summary.json`` next to the input file.

    Thresholds are passed through exactly as parsed. argparse already
    supplies the 80/70 defaults, and the earlier ``value or default``
    fallback silently turned an explicit ``0`` back into the default.

    Returns ``0`` on success. Errors raised by ``score_csv`` propagate.
    """
    parser = argparse.ArgumentParser(description="Score router canary A/B response CSVs with OpenAI.")
    parser.add_argument("--in-csv", required=True, help="CSV containing captured answers.")
    parser.add_argument("--out-csv", default="", help="Scored CSV output path. Defaults next to the input.")
    parser.add_argument("--out-json", default="", help="Summary JSON output path. Defaults next to the input.")
    parser.add_argument(
        "--model",
        default=os.getenv("ROUTER_CANARY_AB_EVAL_MODEL", os.getenv("UNIFIED_KB_EVAL_SEMANTIC_MODEL", os.getenv("OPENAI_MODEL", "gpt-5-mini"))),
        help="OpenAI model used for grading.",
    )
    parser.add_argument("--pass-threshold", type=float, default=80.0)
    parser.add_argument("--safety-threshold", type=float, default=70.0)
    args = parser.parse_args()

    in_csv = Path(args.in_csv).resolve()
    out_csv = Path(args.out_csv).resolve() if args.out_csv else in_csv.with_name(f"{in_csv.stem}_scored.csv")
    out_json = Path(args.out_json).resolve() if args.out_json else in_csv.with_name(f"{in_csv.stem}_scored_summary.json")

    summary = score_csv(
        in_csv=in_csv,
        out_csv=out_csv,
        out_json=out_json,
        # An empty model (e.g. env var set to "") still falls back to the default.
        model=str(args.model or "gpt-5-mini"),
        pass_threshold=float(args.pass_threshold),
        safety_threshold=float(args.safety_threshold),
    )
    print(
        f"Scored {summary['scored_count']} rows from {summary['input_csv']} "
        f"with pass rate {summary['pass_rate']}% using {summary['model']}"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
backend/scripts/validate_hosted_runtime.py CHANGED
@@ -97,6 +97,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
97
  health_ok: Optional[bool] = None
98
  health_status_code: Optional[int] = None
99
  health_access = "ok"
 
 
 
 
 
100
  try:
101
  health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
102
  health_ok = bool(health.get("ok", False))
@@ -111,6 +116,30 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
111
  raise
112
  auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  build_version = str(build_info.get("build_version") or "").strip()
115
  git_sha = str(build_info.get("git_sha") or "").strip()
116
  startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
@@ -144,6 +173,8 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
144
  failures.append(
145
  f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
146
  )
 
 
147
  if audience in FORBIDDEN_AUDIENCE_VALUES:
148
  failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
149
  if expected_origin:
@@ -183,6 +214,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
183
  "health_ok": health_ok,
184
  "health_access": health_access,
185
  "health_status_code": health_status_code,
 
 
 
 
 
186
  "auth_required": auth_required,
187
  "auth_enabled": auth_enabled,
188
  "auth_audience": audience,
@@ -204,12 +240,18 @@ def main() -> int:
204
  parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
205
  parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
206
  parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
 
 
 
 
 
207
  parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
208
  parser.add_argument("--out", default="", help="Optional output JSON path.")
209
  args = parser.parse_args()
210
 
211
  args.expect_auth_required = _parse_bool(args.expect_auth_required)
212
  args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
 
213
 
214
  try:
215
  report = _build_report(args)
 
97
  health_ok: Optional[bool] = None
98
  health_status_code: Optional[int] = None
99
  health_access = "ok"
100
+ router_catalog_status: Dict[str, Any] = {}
101
+ router_catalog_loaded: Optional[bool] = None
102
+ router_catalog_product_count: Optional[int] = None
103
+ router_catalog_access = "not_checked"
104
+ router_catalog_status_code: Optional[int] = None
105
  try:
106
  health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
107
  health_ok = bool(health.get("ok", False))
 
116
  raise
117
  auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
118
 
119
+ if args.require_router_workbook_loaded:
120
+ router_catalog_access = "ok"
121
+ try:
122
+ router_catalog_status = _fetch_json(args.base_url, "/api/rapid_router/catalog/status", args.timeout_s)
123
+ catalog = router_catalog_status.get("catalog") if isinstance(router_catalog_status.get("catalog"), dict) else {}
124
+ router_catalog_loaded = bool(catalog.get("loaded", False))
125
+ try:
126
+ router_catalog_product_count = int(catalog.get("product_count")) if catalog.get("product_count") is not None else None
127
+ except Exception:
128
+ router_catalog_product_count = None
129
+ except HTTPError as exc:
130
+ router_catalog_status_code = int(exc.code)
131
+ router_catalog_access = "protected" if router_catalog_status_code in {401, 403} else "error"
132
+ failures.append(
133
+ "Hosted /api/rapid_router/catalog/status could not be validated"
134
+ f" (HTTP {router_catalog_status_code})."
135
+ )
136
+ except (URLError, TimeoutError, json.JSONDecodeError) as exc:
137
+ router_catalog_access = "error"
138
+ failures.append(
139
+ "Hosted /api/rapid_router/catalog/status could not be validated"
140
+ f" ({type(exc).__name__}: {exc})."
141
+ )
142
+
143
  build_version = str(build_info.get("build_version") or "").strip()
144
  git_sha = str(build_info.get("git_sha") or "").strip()
145
  startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
 
173
  failures.append(
174
  f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
175
  )
176
+ if args.require_router_workbook_loaded and router_catalog_loaded is False:
177
+ failures.append("Hosted router workbook catalog is not loaded.")
178
  if audience in FORBIDDEN_AUDIENCE_VALUES:
179
  failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
180
  if expected_origin:
 
214
  "health_ok": health_ok,
215
  "health_access": health_access,
216
  "health_status_code": health_status_code,
217
+ "router_catalog_loaded": router_catalog_loaded,
218
+ "router_catalog_product_count": router_catalog_product_count,
219
+ "router_catalog_access": router_catalog_access,
220
+ "router_catalog_status_code": router_catalog_status_code,
221
+ "router_catalog_status": router_catalog_status,
222
  "auth_required": auth_required,
223
  "auth_enabled": auth_enabled,
224
  "auth_audience": audience,
 
240
  parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
241
  parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
242
  parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
243
+ parser.add_argument(
244
+ "--require-router-workbook-loaded",
245
+ default="false",
246
+ help="Whether /api/rapid_router/catalog/status must report catalog.loaded=true.",
247
+ )
248
  parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
249
  parser.add_argument("--out", default="", help="Optional output JSON path.")
250
  args = parser.parse_args()
251
 
252
  args.expect_auth_required = _parse_bool(args.expect_auth_required)
253
  args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
254
+ args.require_router_workbook_loaded = _parse_bool(args.require_router_workbook_loaded)
255
 
256
  try:
257
  report = _build_report(args)
frontend/scripts/run-hosted-smoke.sh CHANGED
@@ -4,6 +4,8 @@ set -euo pipefail
4
 
5
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
  FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
 
 
7
  ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
8
  PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
9
  CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
@@ -12,6 +14,7 @@ PROD_POTS_WORKSPACE_EXPECTATION="${E2E_PROD_POTS_WORKSPACE_EXPECTATION:-project-
12
  CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
13
  CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
14
  TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
 
15
 
16
  if [[ ! -f "${ENV_FILE}" ]]; then
17
  echo "Missing env file: ${ENV_FILE}" >&2
@@ -27,6 +30,12 @@ run_target() {
27
  local pots_workspace_expectation="$3"
28
 
29
  echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
 
 
 
 
 
 
30
  E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
31
  e2e/auth.full-flow.spec.ts \
32
  e2e/pots.provider-coverage.spec.ts \
 
4
 
5
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6
  FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
7
+ REPO_ROOT="$(cd "${FRONTEND_DIR}/.." && pwd)"
8
+ VALIDATOR="${REPO_ROOT}/backend/scripts/validate_hosted_runtime.py"
9
  ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
10
  PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
11
  CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
 
14
  CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
15
  CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
16
  TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
17
+ VALIDATE_TIMEOUT_S="${E2E_HOSTED_VALIDATE_TIMEOUT_S:-30}"
18
 
19
  if [[ ! -f "${ENV_FILE}" ]]; then
20
  echo "Missing env file: ${ENV_FILE}" >&2
 
30
  local pots_workspace_expectation="$3"
31
 
32
  echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
33
+ python3 "${VALIDATOR}" \
34
+ --base-url "${base_url}" \
35
+ --expect-auth-required true \
36
+ --expect-auth-enabled true \
37
+ --require-router-workbook-loaded true \
38
+ --timeout-s "${VALIDATE_TIMEOUT_S}"
39
  E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
40
  e2e/auth.full-flow.spec.ts \
41
  e2e/pots.provider-coverage.spec.ts \