Pete Dunn committed on
Commit ·
ddae266
1
Parent(s): 640f404
Harden router canary eval preflight and survey runs
Browse files- .github/workflows/deploy-hf-gated.yml +2 -0
- backend/app/knowledgebase/core.py +247 -18
- backend/app/main.py +25 -2
- backend/app/test_rapid_router_catalog_bootstrap.py +24 -0
- backend/app/test_score_router_canary_ab_responses.py +82 -0
- backend/app/test_unified_kb_core.py +95 -0
- backend/app/test_unified_kb_router_workbook.py +91 -0
- backend/app/test_validate_hosted_runtime.py +124 -1
- backend/scripts/run_router_canary_ab_eval_shard.py +657 -0
- backend/scripts/score_router_canary_ab_responses.py +363 -0
- backend/scripts/validate_hosted_runtime.py +42 -0
- frontend/scripts/run-hosted-smoke.sh +9 -0
.github/workflows/deploy-hf-gated.yml
CHANGED
|
@@ -379,6 +379,7 @@ jobs:
|
|
| 379 |
--expected-git-sha "${EXPECTED_SHA}" \
|
| 380 |
--expect-auth-required true \
|
| 381 |
--expect-auth-enabled true \
|
|
|
|
| 382 |
--out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
|
| 383 |
|
| 384 |
deploy-production:
|
|
@@ -514,4 +515,5 @@ jobs:
|
|
| 514 |
--expected-git-sha "${EXPECTED_SHA}" \
|
| 515 |
--expect-auth-required true \
|
| 516 |
--expect-auth-enabled true \
|
|
|
|
| 517 |
--out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
|
|
|
|
| 379 |
--expected-git-sha "${EXPECTED_SHA}" \
|
| 380 |
--expect-auth-required true \
|
| 381 |
--expect-auth-enabled true \
|
| 382 |
+
--require-router-workbook-loaded true \
|
| 383 |
--out "docs/evals/canary_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
|
| 384 |
|
| 385 |
deploy-production:
|
|
|
|
| 515 |
--expected-git-sha "${EXPECTED_SHA}" \
|
| 516 |
--expect-auth-required true \
|
| 517 |
--expect-auth-enabled true \
|
| 518 |
+
--require-router-workbook-loaded true \
|
| 519 |
--out "docs/evals/production_runtime_validation_${{ github.run_id }}_${{ github.run_attempt }}.json"
|
backend/app/knowledgebase/core.py
CHANGED
|
@@ -1695,6 +1695,52 @@ def _looks_like_router_lifecycle(message: str) -> bool:
|
|
| 1695 |
return False
|
| 1696 |
|
| 1697 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1698 |
def _looks_like_pots(message: str) -> bool:
|
| 1699 |
low = _normalize_router_query_text(message)
|
| 1700 |
if _contains_any(low, _POTS_HINTS):
|
|
@@ -8811,8 +8857,20 @@ class UnifiedKnowledgebaseCore:
|
|
| 8811 |
continue
|
| 8812 |
seen_requested_compacts.add(compact)
|
| 8813 |
requested_compare_labels.append(label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8814 |
asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
|
| 8815 |
if asks_install_caveats:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8816 |
compare_field_labels = {
|
| 8817 |
"wan_lan": "WAN/LAN ports",
|
| 8818 |
"antennas_rf": "RF connectors",
|
|
@@ -8842,13 +8900,108 @@ class UnifiedKnowledgebaseCore:
|
|
| 8842 |
elif "install_caveats" not in requested_compare_fields:
|
| 8843 |
requested_compare_fields.append("install_caveats")
|
| 8844 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8845 |
def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
|
| 8846 |
raw = _norm(row.get(field, ""))
|
| 8847 |
if (not raw) and field == "install_caveats":
|
| 8848 |
raw = _norm(row.get("special notes", ""))
|
| 8849 |
-
|
| 8850 |
-
return "Not clearly documented"
|
| 8851 |
-
return _truncate(_fix_common_mojibake(raw), 170)
|
| 8852 |
|
| 8853 |
def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
|
| 8854 |
doc_rows: List[Dict[str, str]] = []
|
|
@@ -9140,14 +9293,6 @@ class UnifiedKnowledgebaseCore:
|
|
| 9140 |
if catalog_table is not None:
|
| 9141 |
return catalog_table
|
| 9142 |
return None
|
| 9143 |
-
wants_doc_matrix = (
|
| 9144 |
-
("documented" in low and "not documented" in low)
|
| 9145 |
-
or ("include what is documented" in low)
|
| 9146 |
-
or ("what is documented vs not documented" in low)
|
| 9147 |
-
)
|
| 9148 |
-
wants_docs_only = wants_doc_matrix or any(
|
| 9149 |
-
h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
|
| 9150 |
-
)
|
| 9151 |
deterministic_doc_matrix_supported = bool(
|
| 9152 |
wants_doc_matrix
|
| 9153 |
and len(dedup_models) >= 2
|
|
@@ -9185,6 +9330,31 @@ class UnifiedKnowledgebaseCore:
|
|
| 9185 |
if not value:
|
| 9186 |
return "Not clearly documented"
|
| 9187 |
low_value = value.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9188 |
if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
|
| 9189 |
return "Not clearly documented"
|
| 9190 |
if field_name == "wan_lan":
|
|
@@ -9206,6 +9376,8 @@ class UnifiedKnowledgebaseCore:
|
|
| 9206 |
return "Not clearly documented"
|
| 9207 |
return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
|
| 9208 |
if field_name == "antennas_rf":
|
|
|
|
|
|
|
| 9209 |
if "rf:" in value.lower():
|
| 9210 |
value = value.split("RF:", 1)[-1]
|
| 9211 |
connector_match = re.search(
|
|
@@ -9237,6 +9409,8 @@ class UnifiedKnowledgebaseCore:
|
|
| 9237 |
return "Not clearly documented"
|
| 9238 |
return _truncate(value, 120)
|
| 9239 |
if field_name == "modem":
|
|
|
|
|
|
|
| 9240 |
modem_match = re.search(
|
| 9241 |
r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
|
| 9242 |
value,
|
|
@@ -12488,6 +12662,51 @@ class UnifiedKnowledgebaseCore:
|
|
| 12488 |
return "Listed, but family not explicit"
|
| 12489 |
return "Needs connector validation"
|
| 12490 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12491 |
def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
|
| 12492 |
use_case = _norm(row.get("primary_use_case", ""))
|
| 12493 |
rugged = _norm(row.get("ruggedization", ""))
|
|
@@ -12663,7 +12882,7 @@ class UnifiedKnowledgebaseCore:
|
|
| 12663 |
sources: List[Dict[str, Any]] = []
|
| 12664 |
for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
|
| 12665 |
wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
|
| 12666 |
-
rf =
|
| 12667 |
lines.append(
|
| 12668 |
f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
|
| 12669 |
f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
|
|
@@ -12810,7 +13029,7 @@ class UnifiedKnowledgebaseCore:
|
|
| 12810 |
modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
|
| 12811 |
rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
|
| 12812 |
battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
|
| 12813 |
-
rf =
|
| 12814 |
lines.append(
|
| 12815 |
f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
|
| 12816 |
f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
|
|
@@ -29329,9 +29548,19 @@ class UnifiedKnowledgebaseCore:
|
|
| 29329 |
]
|
| 29330 |
ranking_rows: List[Tuple[int, int, str]] = []
|
| 29331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29332 |
def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
|
| 29333 |
qty = int(item.get("qty") or 0)
|
| 29334 |
-
router_name =
|
| 29335 |
if not matched:
|
| 29336 |
return 150 + min(qty, 25), (
|
| 29337 |
f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
|
|
@@ -29373,7 +29602,7 @@ class UnifiedKnowledgebaseCore:
|
|
| 29373 |
reasons.append("authoritative lifecycle dates are incomplete")
|
| 29374 |
|
| 29375 |
reason_text = ", and ".join(reasons[:3])
|
| 29376 |
-
return score, f"`{
|
| 29377 |
|
| 29378 |
asks_ranked_output = any(
|
| 29379 |
token in query.normalized_message
|
|
@@ -29393,14 +29622,14 @@ class UnifiedKnowledgebaseCore:
|
|
| 29393 |
status = _status_label(match, lifecycle)
|
| 29394 |
eos = lifecycle.get("end_of_sale_date") or "Not listed"
|
| 29395 |
eol = lifecycle.get("last_support_date") or "Not listed"
|
| 29396 |
-
router_name =
|
| 29397 |
else:
|
| 29398 |
same_brand = "Needs exact workbook match"
|
| 29399 |
backup = "Needs exact workbook match"
|
| 29400 |
status = "Needs exact workbook match"
|
| 29401 |
eos = "Not listed"
|
| 29402 |
eol = "Not listed"
|
| 29403 |
-
router_name =
|
| 29404 |
rows.append(
|
| 29405 |
"| "
|
| 29406 |
+ " | ".join(
|
|
@@ -31813,7 +32042,7 @@ class UnifiedKnowledgebaseCore:
|
|
| 31813 |
blocked["meta"] = blocked_meta
|
| 31814 |
return blocked
|
| 31815 |
lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
|
| 31816 |
-
if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text):
|
| 31817 |
blocked = self._policy_block_response("exact_lifecycle", st)
|
| 31818 |
blocked_meta = _as_dict(blocked.get("meta"))
|
| 31819 |
blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}
|
|
|
|
| 1695 |
return False
|
| 1696 |
|
| 1697 |
|
| 1698 |
+
def _is_supported_router_mixed_lifecycle_request(message: str) -> bool:
    """Report whether *message* qualifies as a supported mixed lifecycle request.

    The message must parse as a router intelligence query whose intent is
    ``details`` or ``compare``, must not contain any of the exact-date
    phrases, and must contain at least one of the safe field/comparison
    phrases.  All phrase checks are plain substring tests against the
    normalized message text.
    """
    # Normalize first, then parse the raw message (same call order as before).
    normalized_text = _normalize_router_query_text(message)
    parsed = parse_router_intelligence_query(message)
    if parsed is None:
        return False
    if parsed.intent not in {"details", "compare"}:
        return False

    # Any of these phrases means the caller wants a precise lifecycle date.
    blocked_phrases = (
        "exact lifecycle date",
        "lifecycle date",
        "end of sale date",
        "end of life date",
        "eos date",
        "eol date",
        "what exact date",
    )
    for phrase in blocked_phrases:
        if phrase in normalized_text:
            return False

    # At least one field/comparison phrase must be present.
    allowed_phrases = (
        "primary use case",
        "use case",
        "wan/lan",
        "wan",
        "lan",
        "ethernet",
        "ports",
        "wi-fi",
        "wifi",
        "modem",
        "rf connector",
        "rf connectors",
        "connectors",
        "install caveat",
        "install caveats",
        "current recommendation",
        "still current recommendation",
        "status",
        "documented vs",
        "not documented",
        "compare",
        "difference",
        "different",
        "versus",
        " vs ",
    )
    for phrase in allowed_phrases:
        if phrase in normalized_text:
            return True
    return False
|
| 1742 |
+
|
| 1743 |
+
|
| 1744 |
def _looks_like_pots(message: str) -> bool:
|
| 1745 |
low = _normalize_router_query_text(message)
|
| 1746 |
if _contains_any(low, _POTS_HINTS):
|
|
|
|
| 8857 |
continue
|
| 8858 |
seen_requested_compacts.add(compact)
|
| 8859 |
requested_compare_labels.append(label)
|
| 8860 |
+
wants_doc_matrix = (
|
| 8861 |
+
("documented" in low and "not documented" in low)
|
| 8862 |
+
or ("include what is documented" in low)
|
| 8863 |
+
or ("what is documented vs not documented" in low)
|
| 8864 |
+
)
|
| 8865 |
+
wants_docs_only = wants_doc_matrix or any(
|
| 8866 |
+
h in low for h in ("from documented specs only", "documented specs only", "from docs only", "docs only")
|
| 8867 |
+
)
|
| 8868 |
asks_install_caveats = any(h in low for h in ("install caveat", "install caveats"))
|
| 8869 |
if asks_install_caveats:
|
| 8870 |
+
# This helper only runs in the router_docs lane, so install-caveat
|
| 8871 |
+
# compare prompts should still stay source-bounded even when the
|
| 8872 |
+
# user omits an explicit "docs only" phrase.
|
| 8873 |
+
wants_docs_only = True
|
| 8874 |
compare_field_labels = {
|
| 8875 |
"wan_lan": "WAN/LAN ports",
|
| 8876 |
"antennas_rf": "RF connectors",
|
|
|
|
| 8900 |
elif "install_caveats" not in requested_compare_fields:
|
| 8901 |
requested_compare_fields.append("install_caveats")
|
| 8902 |
|
| 8903 |
+
variant_ambiguity_markers = (
|
| 8904 |
+
"vary by sku",
|
| 8905 |
+
"varies by sku",
|
| 8906 |
+
"depends on package",
|
| 8907 |
+
"depends on package or accessory",
|
| 8908 |
+
"exact modem depends",
|
| 8909 |
+
"package or accessory",
|
| 8910 |
+
"supported modem option",
|
| 8911 |
+
"supported modem options",
|
| 8912 |
+
"supported modem sku",
|
| 8913 |
+
"supported modem skus",
|
| 8914 |
+
"modem-equipped variant",
|
| 8915 |
+
"modem-equipped variants",
|
| 8916 |
+
"no-modem",
|
| 8917 |
+
)
|
| 8918 |
+
|
| 8919 |
+
def _has_variant_ambiguity(value: str) -> bool:
    """Return True when lower-cased *value* contains any entry of ``variant_ambiguity_markers``."""
    lowered = value.lower()
    for marker in variant_ambiguity_markers:
        if marker in lowered:
            return True
    return False
|
| 8922 |
+
|
| 8923 |
+
def _sanitize_catalog_install_compare_value(field: str, raw_value: Any) -> str:
    """Reduce a raw catalog cell to a short value for the install compare table.

    The raw value is normalized and mojibake-repaired first; an empty result
    yields "Not clearly documented".  ``wan_lan``, ``antennas_rf``, ``modem``,
    ``wifi`` and ``battery`` each get field-specific extraction and
    truncation.  ``antennas_rf``, ``modem`` and ``install_caveats`` collapse
    to a fixed "exact SKU/package" message when the text carries
    variant-ambiguity markers.  Anything else is returned truncated to 170
    characters.
    """
    not_documented = "Not clearly documented"
    text = _fix_common_mojibake(_norm(raw_value))
    if not text:
        return not_documented
    lowered = text.lower()

    if field == "wan_lan":
        port_match = re.search(
            r"\bfive ethernet ports?\b|\bdual ethernet ports?\b|\b\d+\s*(?:x|\*)\s*(?:10/100/1000|10/100|100mbit/s|100mbps|1gbe|2\.5gbe|ge)\s*(?:rj45\s*)?(?:ethernet|network)\s*ports?\b(?:[^.;]{0,40}\b(?:wan|lan|vlan)\b[^.;]{0,40})?",
            text,
            flags=re.IGNORECASE,
        )
        # Prefer the matched port phrase; otherwise keep the whole cell.
        candidate = port_match.group(0) if port_match else text
        candidate_low = candidate.lower()
        port_signal_patterns = (
            r"\b(?:wan|lan|ethernet|rj45|sfp\+?)\b",
            r"\b\d+\s*x\b",
            r"\b\d+(?:\.\d+)?\s*(?:gbe?|mbps|gbps)\b",
            r"\b(single|dual|triple|quad|five)\b",
        )
        if not any(re.search(pattern, candidate_low) for pattern in port_signal_patterns):
            return not_documented
        return _truncate(re.sub(r"\s+", " ", candidate).strip(), 140)

    if field == "antennas_rf":
        if _has_variant_ambiguity(text):
            return "Needs exact SKU/package; connector path varies across documented family variants."
        connector_match = re.search(
            r"(?:\d+\s*x\s*)?(?:rp-?sma|sma)\b[^.;]{0,80}|antenna connectors?[^.;]{0,100}|(?:gps|gnss)\b[^.;]{0,80}",
            text,
            flags=re.IGNORECASE,
        )
        if not connector_match:
            return not_documented
        connector_text = connector_match.group(0)
        # Strip prefixes, hedging parentheticals and trailing side notes, in order.
        noise_patterns = (
            r"^(?:both|external|internal)\s*\([^)]*\)\s*;?\s*",
            r"\([^)]*(?:depends on|if present|verify [^)]+|see [^)]+ docs)[^)]*\)",
            r"[.;]\s*Adapter pigtails?:[^.;]*",
            r"[.;]\s*connectors likely[^.;]*",
            r"[.;]\s*verify connector type[^.;]*",
            r"[.;]\s*(SIM|Ethernet):[^.;]*",
        )
        for pattern in noise_patterns:
            connector_text = re.sub(pattern, "", connector_text, flags=re.IGNORECASE)
        connector_text = re.sub(r"\s+", " ", connector_text).strip(" ;.")
        connector_low = connector_text.lower()
        if (not connector_text) or (
            not any(token in connector_low for token in ("sma", "rp-sma", "connector", "gnss", "gps"))
        ):
            return not_documented
        return _truncate(connector_text, 140)

    if field == "modem":
        if _has_variant_ambiguity(text):
            return "Needs exact SKU/package; modem path varies across documented family variants."
        modem_match = re.search(
            r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
            text,
            flags=re.IGNORECASE,
        )
        if not modem_match:
            return not_documented
        modem_text = modem_match.group(0)
        modem_low = modem_text.lower()
        if modem_low in {"modem", "modems", "cellular", "cellular modem"}:
            return not_documented
        # Reject matches that drifted into non-modem marketing/port text.
        if any(
            token in modem_low
            for token in ("ethernet", "wan", "lan", "cloud management", "ports", "wi-fi", "vpn", "connectivity", "secure")
        ):
            return not_documented
        return _truncate(modem_text, 120)

    if field == "wifi":
        if any(token in lowered for token in ("none", "no wi-fi", "no wifi", "without wi-fi", "without wifi")):
            return "None"
        if ("802.11" not in lowered) and (not re.search(r"\bwi-?fi\s*[4567]\b", lowered)):
            return not_documented
        return _truncate(text, 120)

    if field == "battery":
        if lowered == "none":
            return "None"
        return _truncate(text, 120)

    if field == "install_caveats":
        if _has_variant_ambiguity(text) or any(
            token in lowered for token in ("verify exact", "exact sku", "exact modem", "exact package")
        ):
            return "Verify exact SKU/package before finalizing modem, RF, or accessory assumptions."
    return _truncate(text, 170)
|
| 8999 |
+
|
| 9000 |
def _catalog_compare_value(row: Dict[str, Any], field: str) -> str:
    """Look up *field* in a catalog row and return its sanitized compare-table value.

    For ``install_caveats`` an empty cell falls back to the row's
    ``"special notes"`` column before sanitizing.
    """
    cell = _norm(row.get(field, ""))
    use_notes_fallback = field == "install_caveats" and not cell
    if use_notes_fallback:
        cell = _norm(row.get("special notes", ""))
    return _sanitize_catalog_install_compare_value(field, cell)
|
|
|
|
|
|
|
| 9005 |
|
| 9006 |
def _catalog_install_compare_table() -> Optional[Dict[str, Any]]:
|
| 9007 |
doc_rows: List[Dict[str, str]] = []
|
|
|
|
| 9293 |
if catalog_table is not None:
|
| 9294 |
return catalog_table
|
| 9295 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9296 |
deterministic_doc_matrix_supported = bool(
|
| 9297 |
wants_doc_matrix
|
| 9298 |
and len(dedup_models) >= 2
|
|
|
|
| 9330 |
if not value:
|
| 9331 |
return "Not clearly documented"
|
| 9332 |
low_value = value.lower()
|
| 9333 |
+
variant_ambiguity_markers = (
|
| 9334 |
+
"variant",
|
| 9335 |
+
"variants",
|
| 9336 |
+
"exact sku",
|
| 9337 |
+
"exact package",
|
| 9338 |
+
"exact modem",
|
| 9339 |
+
"modem option",
|
| 9340 |
+
"modem options",
|
| 9341 |
+
"sku/package",
|
| 9342 |
+
"sku package",
|
| 9343 |
+
"depends on sku",
|
| 9344 |
+
"depends on package",
|
| 9345 |
+
"see model-specific docs",
|
| 9346 |
+
"see exact sku docs",
|
| 9347 |
+
"supported modem sku",
|
| 9348 |
+
"supported modem skus",
|
| 9349 |
+
"modem-equipped variant",
|
| 9350 |
+
"modem-equipped variants",
|
| 9351 |
+
"no-modem",
|
| 9352 |
+
)
|
| 9353 |
+
|
| 9354 |
+
def _has_variant_ambiguity_local(text: str) -> bool:
    """Return True when lower-cased *text* contains any entry of ``variant_ambiguity_markers``."""
    lowered = text.lower()
    for marker in variant_ambiguity_markers:
        if marker in lowered:
            return True
    return False
|
| 9357 |
+
|
| 9358 |
if any(token in low_value for token in ("not listed", "abstained", "unknown", "csv conflict", "(blank)")):
|
| 9359 |
return "Not clearly documented"
|
| 9360 |
if field_name == "wan_lan":
|
|
|
|
| 9376 |
return "Not clearly documented"
|
| 9377 |
return _truncate(re.sub(r"\s+", " ", value).strip(), 140)
|
| 9378 |
if field_name == "antennas_rf":
|
| 9379 |
+
if _has_variant_ambiguity_local(value):
|
| 9380 |
+
return "Needs exact SKU/package; connector path varies across documented family variants."
|
| 9381 |
if "rf:" in value.lower():
|
| 9382 |
value = value.split("RF:", 1)[-1]
|
| 9383 |
connector_match = re.search(
|
|
|
|
| 9409 |
return "Not clearly documented"
|
| 9410 |
return _truncate(value, 120)
|
| 9411 |
if field_name == "modem":
|
| 9412 |
+
if _has_variant_ambiguity_local(value):
|
| 9413 |
+
return "Needs exact SKU/package; modem path varies across documented family variants."
|
| 9414 |
modem_match = re.search(
|
| 9415 |
r"\b(?:5g(?:\s*nr)?(?:\s*sa)?(?:\s*/\s*nsa)?|4g(?:\s*lte)?(?:\s*cat\s*\d+)?|lte\s*cat\s*\d+)\b[^.;,]{0,50}",
|
| 9416 |
value,
|
|
|
|
| 12662 |
return "Listed, but family not explicit"
|
| 12663 |
return "Needs connector validation"
|
| 12664 |
|
| 12665 |
+
def _vehicle_compare_connector_summary(row: Dict[str, Any]) -> str:
    """Summarize the RF connector phrases found in a row's ``antennas_rf`` cell.

    Connector clauses are collected in discovery order, de-duplicated
    case-insensitively, joined with "; " and truncated to 140 characters.
    Returns "Not clearly documented" when the cell is empty or nothing
    connector-like can be extracted.
    """
    not_documented = "Not clearly documented"
    rf_text = _fix_common_mojibake(_norm(row.get("antennas_rf", "")))
    if not rf_text:
        return not_documented
    rf_low = rf_text.lower()

    collected: List[str] = []
    seen_lower = set()

    def _remember(text: str) -> None:
        # Collapse whitespace, trim punctuation, and keep first occurrence only.
        cleaned = re.sub(r"\s+", " ", _norm(text)).strip(" ;,.")
        if not cleaned:
            return
        key = cleaned.lower()
        if key in seen_lower:
            return
        seen_lower.add(key)
        collected.append(cleaned)

    # Pass 1 and 2: counted connectors first, then descriptive connector phrases.
    phrase_patterns = (
        r"\b\d+\s*x\s*(?:sma|rp-?sma)\s*(?:cellular|wi-?fi|gnss|gps)?(?:\s+connectors?)?\b",
        r"\b(?:external\s+)?(?:cellular\s+sma connectors?|reverse-?sma wi-?fi connectors?|sma rf connectors?|wi-?fi variant uses rp-?sma)\b",
    )
    for pattern in phrase_patterns:
        for found in re.finditer(pattern, rf_text, flags=re.IGNORECASE):
            _remember(found.group(0))

    # Pass 3: connector mentions inside parentheses, skipping adapter/typical notes.
    for inner in re.findall(r"\(([^)]*(?:sma|rp-?sma|gps|gnss)[^)]*)\)", rf_text, flags=re.IGNORECASE):
        for piece in re.split(r"[;,]", inner):
            piece_low = piece.lower()
            if any(token in piece_low for token in ("typical", "adapter", "pigtail")):
                continue
            if re.search(r"\b(?:sma|rp-?sma|gps|gnss)\b", piece, flags=re.IGNORECASE):
                _remember(piece)

    # Fallback: nothing structured matched but the cell still mentions connectors.
    if not collected and any(token in rf_low for token in ("sma", "rp-sma", "gps", "gnss", "connector")):
        remainder = re.split(r"[.;]\s*Adapter pigtails?:", rf_text, maxsplit=1, flags=re.IGNORECASE)[0]
        for pattern in (
            r"\bCellular:\s*4x4 MIMO on SMA\b",
            r"\bWi-?Fi(?:\s*\(if present\))?\s+on\s+RP-SMA\b",
            r"\bGNSS on SMA\b",
        ):
            remainder = re.sub(pattern, "", remainder, flags=re.IGNORECASE)
        remainder = re.sub(r"\s+", " ", remainder).strip(" ;,.")
        if remainder:
            _remember(remainder)

    if not collected:
        return not_documented
    return _truncate("; ".join(collected), 140)
|
| 12709 |
+
|
| 12710 |
def _vehicle_compare_reason(row: Dict[str, Any]) -> str:
|
| 12711 |
use_case = _norm(row.get("primary_use_case", ""))
|
| 12712 |
rugged = _norm(row.get("ruggedization", ""))
|
|
|
|
| 12882 |
sources: List[Dict[str, Any]] = []
|
| 12883 |
for idx, (display, row, antenna_family, why_fit) in enumerate(compare_rows, start=1):
|
| 12884 |
wan_lan = _norm(row.get("wan_lan", "")) or "Not listed"
|
| 12885 |
+
rf = _vehicle_compare_connector_summary(row)
|
| 12886 |
lines.append(
|
| 12887 |
f"| {_md_cell(display)} | {_md_cell(_norm(row.get('primary_use_case', '')) or 'Vehicle/mobile signal reviewed')} "
|
| 12888 |
f"| {_md_cell(wan_lan)} | {_md_cell(rf)} | {_md_cell(antenna_family)} | {_md_cell(why_fit)} |"
|
|
|
|
| 13029 |
modem = _norm(row.get("modem", "")) or "Not listed (abstained)"
|
| 13030 |
rugged = _norm(row.get("ruggedization", "")) or "Not listed (abstained)"
|
| 13031 |
battery = _norm(row.get("battery", "")) or "Not listed (abstained)"
|
| 13032 |
+
rf = _vehicle_compare_connector_summary(row)
|
| 13033 |
lines.append(
|
| 13034 |
f"| {idx} | {_md_cell(model_name)} | {_md_cell(modem)} | {_md_cell(rugged)} | "
|
| 13035 |
f"{_md_cell(battery)} | {_md_cell(rf)} | {_md_cell(note)} |"
|
|
|
|
| 29548 |
]
|
| 29549 |
ranking_rows: List[Tuple[int, int, str]] = []
|
| 29550 |
|
| 29551 |
+
def _fleet_router_name(item: Dict[str, Any], matched: Dict[str, Any]) -> str:
    """Pick the router label to show for a fleet item.

    Args:
        item: Requested fleet entry; ``model_display`` is preferred over
            ``product_text`` as the user-facing label.
        matched: Workbook match result whose ``"match"`` entry may carry a
            ``product_key``.  May be empty or ``None`` when no match exists.

    Returns:
        The requested label when it and the matched ``product_key`` compact to
        the same model; otherwise the ``product_key``.  With only one of the
        two available, that one is used, then the match's display name, then
        ``"Unknown"``.
    """
    requested_label = _norm(item.get("model_display") or item.get("product_text") or "")
    # Callers invoke this helper before their own `if not matched:` guard, so
    # coerce a missing/None match container instead of failing on `.get`.
    match = _as_dict(_as_dict(matched).get("match"))
    product_key = _norm(match.get("product_key") or "")
    if requested_label and product_key:
        if _compact_model(requested_label) == _compact_model(product_key):
            return requested_label
        return product_key
    return product_key or requested_label or _display_name(match) or "Unknown"
|
| 29560 |
+
|
| 29561 |
def _fleet_priority(item: Dict[str, Any], matched: Dict[str, Any]) -> Tuple[int, str]:
|
| 29562 |
qty = int(item.get("qty") or 0)
|
| 29563 |
+
router_name = _fleet_router_name(item, matched)
|
| 29564 |
if not matched:
|
| 29565 |
return 150 + min(qty, 25), (
|
| 29566 |
f"`{router_name}` needs an exact workbook model match before I can rank its migration path safely."
|
|
|
|
| 29602 |
reasons.append("authoritative lifecycle dates are incomplete")
|
| 29603 |
|
| 29604 |
reason_text = ", and ".join(reasons[:3])
|
| 29605 |
+
return score, f"`{router_name}` should be prioritized because {reason_text}."
|
| 29606 |
|
| 29607 |
asks_ranked_output = any(
|
| 29608 |
token in query.normalized_message
|
|
|
|
| 29622 |
status = _status_label(match, lifecycle)
|
| 29623 |
eos = lifecycle.get("end_of_sale_date") or "Not listed"
|
| 29624 |
eol = lifecycle.get("last_support_date") or "Not listed"
|
| 29625 |
+
router_name = _fleet_router_name(item, matched)
|
| 29626 |
else:
|
| 29627 |
same_brand = "Needs exact workbook match"
|
| 29628 |
backup = "Needs exact workbook match"
|
| 29629 |
status = "Needs exact workbook match"
|
| 29630 |
eos = "Not listed"
|
| 29631 |
eol = "Not listed"
|
| 29632 |
+
router_name = _fleet_router_name(item, matched)
|
| 29633 |
rows.append(
|
| 29634 |
"| "
|
| 29635 |
+ " | ".join(
|
|
|
|
| 32042 |
blocked["meta"] = blocked_meta
|
| 32043 |
return blocked
|
| 32044 |
lifecycle_policy_text = _scrub_router_model_tokens_for_policy(msg)
|
| 32045 |
+
if _EXACT_LIFECYCLE_RE.search(lifecycle_policy_text) and (not _is_supported_router_mixed_lifecycle_request(msg)):
|
| 32046 |
blocked = self._policy_block_response("exact_lifecycle", st)
|
| 32047 |
blocked_meta = _as_dict(blocked.get("meta"))
|
| 32048 |
blocked_meta["timing_ms"] = {"total": round((time.perf_counter() - t_total) * 1000.0, 2)}
|
backend/app/main.py
CHANGED
|
@@ -1706,16 +1706,19 @@ _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS: Tuple[str, ...] = (
|
|
| 1706 |
|
| 1707 |
|
| 1708 |
def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
|
| 1709 |
-
|
| 1710 |
Path.cwd(),
|
| 1711 |
Path.cwd() / "backend",
|
|
|
|
| 1712 |
Path.home() / "Downloads",
|
| 1713 |
Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
|
| 1714 |
Path("/data"),
|
|
|
|
| 1715 |
Path("/tmp"),
|
|
|
|
| 1716 |
]
|
| 1717 |
seen: set[str] = set()
|
| 1718 |
-
for root in
|
| 1719 |
root_key = str(root)
|
| 1720 |
if root_key in seen or not root.exists() or not root.is_dir():
|
| 1721 |
continue
|
|
@@ -1724,6 +1727,26 @@ def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
|
|
| 1724 |
for candidate in sorted(root.glob(pattern)):
|
| 1725 |
if candidate.is_file():
|
| 1726 |
return candidate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1727 |
return None
|
| 1728 |
|
| 1729 |
|
|
|
|
| 1706 |
|
| 1707 |
|
| 1708 |
def _resolve_rapid_router_catalog_workbook_path() -> Optional[Path]:
|
| 1709 |
+
direct_search_roots: List[Path] = [
|
| 1710 |
Path.cwd(),
|
| 1711 |
Path.cwd() / "backend",
|
| 1712 |
+
Path.cwd() / "backend" / "data",
|
| 1713 |
Path.home() / "Downloads",
|
| 1714 |
Path.home() / "Library" / "CloudStorage" / "Dropbox" / "Mac" / "Downloads",
|
| 1715 |
Path("/data"),
|
| 1716 |
+
Path("/data/rapid_router"),
|
| 1717 |
Path("/tmp"),
|
| 1718 |
+
_resolve_rapid_router_storage_dir(),
|
| 1719 |
]
|
| 1720 |
seen: set[str] = set()
|
| 1721 |
+
for root in direct_search_roots:
|
| 1722 |
root_key = str(root)
|
| 1723 |
if root_key in seen or not root.exists() or not root.is_dir():
|
| 1724 |
continue
|
|
|
|
| 1727 |
for candidate in sorted(root.glob(pattern)):
|
| 1728 |
if candidate.is_file():
|
| 1729 |
return candidate
|
| 1730 |
+
recursive_search_roots: List[Path] = [
|
| 1731 |
+
Path.cwd() / "backend" / "data",
|
| 1732 |
+
Path("/data"),
|
| 1733 |
+
Path("/data/rapid_router"),
|
| 1734 |
+
_resolve_rapid_router_storage_dir(),
|
| 1735 |
+
Path("/tmp"),
|
| 1736 |
+
]
|
| 1737 |
+
for root in recursive_search_roots:
|
| 1738 |
+
root_key = f"recursive:{root}"
|
| 1739 |
+
if root_key in seen or not root.exists() or not root.is_dir():
|
| 1740 |
+
continue
|
| 1741 |
+
seen.add(root_key)
|
| 1742 |
+
for pattern in _RAPID_ROUTER_CATALOG_WORKBOOK_PATTERNS:
|
| 1743 |
+
for candidate in sorted(root.rglob(pattern)):
|
| 1744 |
+
try:
|
| 1745 |
+
depth = len(candidate.relative_to(root).parts)
|
| 1746 |
+
except Exception:
|
| 1747 |
+
depth = 999
|
| 1748 |
+
if candidate.is_file() and depth <= 4:
|
| 1749 |
+
return candidate
|
| 1750 |
return None
|
| 1751 |
|
| 1752 |
|
backend/app/test_rapid_router_catalog_bootstrap.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import app.main as main
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_resolve_rapid_router_catalog_workbook_path_finds_nested_backend_data_workbook(
    tmp_path: Path, monkeypatch
) -> None:
    """A workbook nested under ``backend/data/rapid_router/imports`` is resolved.

    The working directory, storage directory and ``Path.home`` are all pointed
    at locations inside ``tmp_path`` before the resolver is called.
    """
    imports_dir = tmp_path.joinpath("backend", "data", "rapid_router", "imports")
    imports_dir.mkdir(parents=True)
    expected_workbook = imports_dir / "device_master_source_of_truth_v26_site_survey_integrated_export.xlsx"
    expected_workbook.write_bytes(b"placeholder")

    home_dir = tmp_path / "fake_home"
    (home_dir / "Downloads").mkdir(parents=True)

    monkeypatch.chdir(tmp_path)
    monkeypatch.setattr(main, "_resolve_rapid_router_storage_dir", lambda: tmp_path / "storage")
    monkeypatch.setattr(main.Path, "home", classmethod(lambda cls: home_dir))

    assert main._resolve_rapid_router_catalog_workbook_path() == expected_workbook
|
backend/app/test_score_router_canary_ab_responses.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import importlib.util
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _load_score_module():
    """Import the scoring script by file path and return the executed module.

    The script is looked up at ``scripts/score_router_canary_ab_responses.py``
    relative to the directory two levels above this test file.
    """
    base_dir = Path(__file__).resolve().parents[1]
    script_file = base_dir.joinpath("scripts", "score_router_canary_ab_responses.py")
    module_spec = importlib.util.spec_from_file_location("score_router_canary_ab_responses", script_file)
    assert module_spec is not None
    assert module_spec.loader is not None
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_normalize_grade_payload_scales_fractional_scores_to_percent() -> None:
    """Scores supplied as 0-1 fractions are expected back as whole percentages."""
    scorer = _load_score_module()
    payload = {
        "fact_score": 0.64,
        "instruction_score": 0.81,
        "coverage_score": 0.52,
        "readability_score": 0.9,
        "safety_score": 1.0,
        "overall_score": 0.74,
        "issues": ["ambiguous_specs"],
        "rationale": "fractional payload",
    }

    normalized = scorer._normalize_grade_payload(payload)

    expected = {
        "fact_score": 64,
        "instruction_score": 81,
        "coverage_score": 52,
        "readability_score": 90,
        "safety_score": 100,
        "overall_score": 74,
        "grade": "C",
    }
    for field, want in expected.items():
        assert normalized[field] == want
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_normalize_grade_payload_scales_ten_point_scores_to_percent() -> None:
    """Scores supplied on a 0-10 scale are expected back as percentages.

    No ``overall_score`` is passed in; the test expects one (81) and a grade
    ("B") in the normalized output.
    """
    scorer = _load_score_module()
    payload = {
        "fact_score": 7,
        "instruction_score": 8.5,
        "coverage_score": 6,
        "readability_score": 9,
        "safety_score": 10,
    }

    normalized = scorer._normalize_grade_payload(payload)

    expected = {
        "fact_score": 70,
        "instruction_score": 85,
        "coverage_score": 60,
        "readability_score": 90,
        "safety_score": 100,
        "overall_score": 81,
        "grade": "B",
    }
    for field, want in expected.items():
        assert normalized[field] == want
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def test_normalize_grade_payload_preserves_percent_scale_and_percent_strings() -> None:
    """Percent-scale numbers and strings such as ``"64%"`` are expected to keep their value."""
    scorer = _load_score_module()
    payload = {
        "fact_score": "64%",
        "instruction_score": "81",
        "coverage_score": 52,
        "readability_score": 90,
        "safety_score": "100%",
        "overall_score": 74,
    }

    normalized = scorer._normalize_grade_payload(payload)

    expected = {
        "fact_score": 64,
        "instruction_score": 81,
        "coverage_score": 52,
        "readability_score": 90,
        "safety_score": 100,
        "overall_score": 74,
    }
    for field, want in expected.items():
        assert normalized[field] == want
|
backend/app/test_unified_kb_core.py
CHANGED
|
@@ -4113,6 +4113,24 @@ def test_unified_kb_router_install_caveat_compare_defers_to_router_docs_delegate
|
|
| 4113 |
assert "Router docs answer." in str(out.get("assistant") or "")
|
| 4114 |
|
| 4115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4116 |
def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
|
| 4117 |
core = build_core()
|
| 4118 |
out = core.handle_message(
|
|
@@ -4239,6 +4257,21 @@ def test_unified_kb_docs_only_three_model_compare_defers_to_router_docs() -> Non
|
|
| 4239 |
assert "Lifecycle note:" not in assistant
|
| 4240 |
|
| 4241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4242 |
def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
|
| 4243 |
core = build_core_with(router_core=RepoCsvRouterCore())
|
| 4244 |
out = core.handle_message(
|
|
@@ -4405,6 +4438,54 @@ def test_unified_kb_router_decision_table_compare_defers_to_router_docs() -> Non
|
|
| 4405 |
assert "| MAXBR1PRO |" not in assistant
|
| 4406 |
|
| 4407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4408 |
def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
|
| 4409 |
dec_path = tmp_path / "feb2026routers.csv"
|
| 4410 |
dec_path.write_text(
|
|
@@ -4454,6 +4535,20 @@ def test_unified_kb_br1_connector_compare_uses_deterministic_compare_lane() -> N
|
|
| 4454 |
assert "Lifecycle note:" not in assistant
|
| 4455 |
|
| 4456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4457 |
def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
|
| 4458 |
core = build_core()
|
| 4459 |
out = core.handle_message(
|
|
|
|
| 4113 |
assert "Router docs answer." in str(out.get("assistant") or "")
|
| 4114 |
|
| 4115 |
|
| 4116 |
+
def test_unified_kb_docs_only_install_compare_keeps_variant_family_rows_conservative() -> None:
    """Docs-only AER2200/AER1600 install compare must stay on the caveat-table lane.

    The reply is required to carry the two "Needs exact SKU/package" notes and
    to omit the specific modem-bundle and Wi-Fi connector wording.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore())
    prompt = "For AER2200 and AER1600, summarize WAN/LAN, RF connectors, modem variants, and install caveats in one table from docs only."
    response = kb.handle_message(prompt, {}, mode="auto", audience="auto", show_citations=True)

    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"

    reply = str(response.get("assistant") or "")
    required_phrases = (
        "Needs exact SKU/package; connector path varies across documented family variants.",
        "Needs exact SKU/package; modem path varies across documented family variants.",
    )
    forbidden_phrases = (
        "Cat 4 / Cat 6 / LTE Advanced Pro bundles",
        "external reverse-SMA Wi-Fi connectors",
    )
    for phrase in required_phrases:
        assert phrase in reply
    for phrase in forbidden_phrases:
        assert phrase not in reply
|
| 4132 |
+
|
| 4133 |
+
|
| 4134 |
def test_unified_kb_router_weight_compare_skips_alias_clarification() -> None:
|
| 4135 |
core = build_core()
|
| 4136 |
out = core.handle_message(
|
|
|
|
| 4257 |
assert "Lifecycle note:" not in assistant
|
| 4258 |
|
| 4259 |
|
| 4260 |
+
def test_unified_kb_docs_only_mg_eval_prompt_stays_documented_and_conservative() -> None:
    """MG51/MG52/MG52E compare must use the documented-matrix lane.

    Calls ``_router_multi_model_doc_table_fast`` directly and checks that the
    reply carries the documented/not-documented wording without adapter or
    lifecycle notes.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    prompt = "Compare MG51 vs MG52 vs MG52E and show only meaningful differences, including what is documented vs not documented."
    response = kb._router_multi_model_doc_table_fast(prompt)

    assert response is not None
    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"

    reply = str(response.get("assistant") or "")
    for phrase in ("Documented vs not-documented comparison", "Not documented"):
        assert phrase in reply
    for phrase in ("Adapter pigtails", "Lifecycle note:"):
        assert phrase not in reply
|
| 4273 |
+
|
| 4274 |
+
|
| 4275 |
def test_unified_kb_docs_only_alias_pair_compare_keeps_side_by_side_fast() -> None:
|
| 4276 |
core = build_core_with(router_core=RepoCsvRouterCore())
|
| 4277 |
out = core.handle_message(
|
|
|
|
| 4438 |
assert "| MAXBR1PRO |" not in assistant
|
| 4439 |
|
| 4440 |
|
| 4441 |
+
def test_unified_kb_router_decision_table_keeps_rf_guidance_conservative() -> None:
    """Police-vehicle comparison table must state connector counts only.

    The reply must not contain the "4x4 MIMO on SMA" or "Adapter pigtails"
    wording and must list the cellular and Wi-Fi connector counts.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore())
    prompt = "Build a comparison table for XR60, R980, and MAX BR1 Pro 5G for police vehicles, including RF connectors, vehicle fit, and conservative antenna-family guidance."
    response = kb.handle_message(prompt, {}, mode="auto", audience="auto", show_citations=True)

    assert response["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"

    reply = str(response.get("assistant") or "")
    for phrase in ("4x4 MIMO on SMA", "Adapter pigtails"):
        assert phrase not in reply
    for phrase in ("4x SMA cellular", "2x RP-SMA Wi-Fi"):
        assert phrase in reply
|
| 4456 |
+
|
| 4457 |
+
|
| 4458 |
+
def test_unified_kb_aer_docs_only_eval_prompt_stays_variant_conservative() -> None:
    """AER1600 vs AER2200 docs-only compare must defer to exact SKU/package.

    Calls ``_router_multi_model_doc_table_fast`` directly and expects the
    documented-matrix lane with both "Needs exact SKU/package" notes present.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    prompt = "Compare AER1600 vs AER2200 from docs only and separate clearly documented from not documented."
    response = kb._router_multi_model_doc_table_fast(prompt)

    assert response is not None
    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "router_docs_documented_matrix_fast"

    reply = str(response.get("assistant") or "")
    required_phrases = (
        "Needs exact SKU/package; connector path varies across documented family variants.",
        "Needs exact SKU/package; modem path varies across documented family variants.",
    )
    for phrase in required_phrases:
        assert phrase in reply
    assert "external reverse-SMA Wi-Fi connectors" not in reply
|
| 4470 |
+
|
| 4471 |
+
|
| 4472 |
+
def test_unified_kb_vehicle_eval_prompt_avoids_speculative_rf_language() -> None:
    """Police-vehicle decision table must avoid speculative RF wording.

    At least one of the accepted antenna-guidance markers has to appear in
    the reply.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore())
    prompt = "Build a decision table comparing XR60, R980, and MAX BR1 Pro 5G for police vehicles with recommended antenna families."
    response = kb.handle_message(prompt, {}, mode="auto", audience="auto", show_citations=True)

    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "router_vehicle_5g_recommendation_fast"

    reply = str(response.get("assistant") or "")
    for phrase in ("4x4 MIMO on SMA", "Adapter pigtails"):
        assert phrase not in reply

    accepted_markers = ("Needs connector validation", "Husky", "Listed, but family not explicit")
    assert any(marker in reply for marker in accepted_markers)
|
| 4487 |
+
|
| 4488 |
+
|
| 4489 |
def test_unified_kb_router_decision_table_does_not_default_husky_when_family_is_missing(tmp_path: Path) -> None:
|
| 4490 |
dec_path = tmp_path / "feb2026routers.csv"
|
| 4491 |
dec_path.write_text(
|
|
|
|
| 4535 |
assert "Lifecycle note:" not in assistant
|
| 4536 |
|
| 4537 |
|
| 4538 |
+
def test_unified_kb_br1_docs_only_eval_prompt_avoids_sparse_alias_rows() -> None:
    """BR1 Pro 5G vs BR1 Mini 5G table must cite ``feb2026routers.csv`` for both models.

    Calls ``_router_multi_model_doc_table_fast`` directly; the pricing-catalog
    file name and adapter wording must not appear in the reply.
    """
    kb = build_core_with(router_core=RepoCsvRouterCore(), router_rag_core=StubRouterRagCompareDocs())
    prompt = "Compare MAX BR1 Pro 5G vs MAX BR1 Mini 5G from documented specs only, in table format."
    response = kb._router_multi_model_doc_table_fast(prompt)

    assert response is not None
    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "router_multi_model_doc_table_fast"

    reply = str(response.get("assistant") or "")
    assert "| Internal documented source | feb2026routers.csv | feb2026routers.csv |" in reply
    for phrase in ("router_pricing_catalog_normalized.csv", "Adapter pigtails"):
        assert phrase not in reply
|
| 4550 |
+
|
| 4551 |
+
|
| 4552 |
def test_unified_kb_lifecycle_prompt_does_not_invent_given300_model() -> None:
|
| 4553 |
core = build_core()
|
| 4554 |
out = core.handle_message(
|
backend/app/test_unified_kb_router_workbook.py
CHANGED
|
@@ -94,8 +94,12 @@ def test_unified_kb_router_docs_spec_table_defers_to_documented_sources(tmp_path
|
|
| 94 |
)
|
| 95 |
|
| 96 |
assert out["meta"]["domain"] == "router_docs"
|
|
|
|
| 97 |
assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
|
| 98 |
assert out["meta"].get("router_intelligence_source") != "workbook"
|
|
|
|
|
|
|
|
|
|
| 99 |
assert router_core.calls == 0
|
| 100 |
|
| 101 |
|
|
@@ -163,6 +167,66 @@ def test_unified_kb_router_docs_details_handles_typo_with_workbook(tmp_path: Pat
|
|
| 163 |
assert router_core.calls == 0
|
| 164 |
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
|
| 167 |
eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
|
| 168 |
router_rag = StubRouterRag()
|
|
@@ -525,6 +589,33 @@ def test_unified_kb_router_lifecycle_fleet_snapshot_prefers_workbook(tmp_path: P
|
|
| 525 |
assert router_core.calls == 0
|
| 526 |
|
| 527 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
|
| 529 |
workbook_core = _loaded_workbook_core(tmp_path)
|
| 530 |
core = build_core_with(
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
assert out["meta"]["domain"] == "router_docs"
|
| 97 |
+
assert out["meta"]["retrieval_mode"] == "router_multi_model_doc_caveat_table_fast"
|
| 98 |
assert not str(out["meta"]["retrieval_mode"]).startswith("deterministic_router_workbook_")
|
| 99 |
assert out["meta"].get("router_intelligence_source") != "workbook"
|
| 100 |
+
assert "Documented multi-model compare table (internal docs only):" in out["assistant"]
|
| 101 |
+
assert "| Model | WAN/LAN ports | RF connectors | Modem/cellular | Install caveats | Evidence |" in out["assistant"]
|
| 102 |
+
assert "I need stronger internal citations" not in out["assistant"]
|
| 103 |
assert router_core.calls == 0
|
| 104 |
|
| 105 |
|
|
|
|
| 167 |
assert router_core.calls == 0
|
| 168 |
|
| 169 |
|
| 170 |
+
def test_unified_kb_router_docs_mixed_lifecycle_details_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
    """A mixed lifecycle/details prompt must be answered from the workbook.

    Expects the deterministic workbook details lane, no ``reason`` in the
    metadata, and zero calls to both the RAG stub and the counting router core.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    rag_stub = StubRouterRag()
    counting_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    workbook_core = _loaded_workbook_core(tmp_path)
    kb = build_core_with(
        router_rag_core=rag_stub,
        router_core=counting_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    prompt = "Give me the details on Current-500, including lifecycle status, primary use case, WAN/LAN, Wi-Fi, and whether it is still a current recommendation."
    response = kb.handle_message(prompt, {}, mode="auto", audience="auto", show_citations=True)

    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "deterministic_router_workbook_details"
    assert response["meta"]["router_intelligence_source"] == "workbook"
    assert response["meta"].get("reason") is None
    assert "Current-500" in str(response.get("assistant") or "")
    # Neither fallback path may have been consulted.
    assert rag_stub.calls == 0
    assert counting_core.calls == 0
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def test_unified_kb_router_docs_mixed_lifecycle_compare_prefers_workbook_over_policy_block(tmp_path: Path) -> None:
    """A mixed lifecycle/compare prompt must be answered from the workbook.

    Expects the deterministic workbook compare lane, no ``reason`` in the
    metadata, and zero calls to both the RAG stub and the counting router core.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    rag_stub = StubRouterRag()
    counting_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    workbook_core = _loaded_workbook_core(tmp_path)
    kb = build_core_with(
        router_rag_core=rag_stub,
        router_core=counting_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    prompt = "Compare Legacy-100 and Current-500, including lifecycle status, WAN/LAN, Wi-Fi, and whether either is still a current recommendation."
    response = kb.handle_message(prompt, {}, mode="auto", audience="auto", show_citations=True)

    assert response["meta"]["domain"] == "router_docs"
    assert response["meta"]["retrieval_mode"] == "deterministic_router_workbook_compare"
    assert response["meta"]["router_intelligence_source"] == "workbook"
    assert response["meta"].get("reason") is None
    assert "Workbook-backed router comparison" in str(response.get("assistant") or "")
    # Neither fallback path may have been consulted.
    assert rag_stub.calls == 0
    assert counting_core.calls == 0
|
| 228 |
+
|
| 229 |
+
|
| 230 |
def test_unified_kb_router_docs_compare_paraphrase_uses_gpt_orchestration(tmp_path: Path, monkeypatch) -> None:
|
| 231 |
eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
|
| 232 |
router_rag = StubRouterRag()
|
|
|
|
| 589 |
assert router_core.calls == 0
|
| 590 |
|
| 591 |
|
| 592 |
+
def test_unified_kb_router_lifecycle_fleet_snapshot_keeps_requested_model_labels(tmp_path: Path) -> None:
    """Fleet lifecycle table rows must match the expected row text for each model.

    Expects the workbook fleet-lifecycle lane, the two table rows with their
    quantities, no "Test Corp legacy 4G router" text, and no calls to the
    counting router core.
    """
    eos_csv, dec_csv = _write_router_regression_csvs(tmp_path)
    workbook_core = _loaded_workbook_core(tmp_path)
    counting_core = CountingRouterCore(eos_csv_path=eos_csv, dec_csv_path=dec_csv)
    kb = build_core_with(
        router_rag_core=StubRouterRag(),
        router_core=counting_core,
        masters_core=StubMastersCore(),
        pots_core=StubPotsCore(),
        rapid_router_intelligence_provider=lambda: workbook_core,
    )

    prompt = "Customer portfolio: 12 Legacy-100, 3 Legacy-NR. Build phased 5G replacement strategy with table."
    response = kb.handle_message(prompt, {}, mode="router_lifecycle", audience="auto", show_citations=True)

    assert response["meta"]["retrieval_mode"] == "deterministic_router_workbook_fleet_lifecycle"
    expected_rows = (
        "| Unknown | prod_test_legacy | 12 |",
        "| Unknown | prod_test_no_replacement | 3 |",
    )
    for row in expected_rows:
        assert row in response["assistant"]
    assert "Test Corp legacy 4G router" not in response["assistant"]
    assert counting_core.calls == 0
|
| 617 |
+
|
| 618 |
+
|
| 619 |
def test_unified_kb_router_inventory_import_returns_row_confidence_and_alias_corrections(tmp_path: Path) -> None:
|
| 620 |
workbook_core = _loaded_workbook_core(tmp_path)
|
| 621 |
core = build_core_with(
|
backend/app/test_validate_hosted_runtime.py
CHANGED
|
@@ -15,13 +15,14 @@ def _load_validate_hosted_runtime_module():
|
|
| 15 |
return module
|
| 16 |
|
| 17 |
|
| 18 |
-
def _args(*, expect_auth_required: bool = True):
|
| 19 |
return argparse.Namespace(
|
| 20 |
base_url="https://example.hf.space",
|
| 21 |
expected_build_version="release-123",
|
| 22 |
expected_git_sha="abc123",
|
| 23 |
expect_auth_required=expect_auth_required,
|
| 24 |
expect_auth_enabled=True,
|
|
|
|
| 25 |
timeout_s=20.0,
|
| 26 |
out="",
|
| 27 |
)
|
|
@@ -67,6 +68,128 @@ def test_build_report_accepts_protected_health_when_auth_required(monkeypatch) -
|
|
| 67 |
assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
|
| 68 |
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
|
| 71 |
module = _load_validate_hosted_runtime_module()
|
| 72 |
|
|
|
|
| 15 |
return module
|
| 16 |
|
| 17 |
|
| 18 |
+
def _args(*, expect_auth_required: bool = True, require_router_workbook_loaded: bool = False):
    """Build the ``argparse.Namespace`` these tests pass to ``_build_report``.

    Only the two keyword-only flags vary; every other field is a fixed value.
    """
    fields = {
        "base_url": "https://example.hf.space",
        "expected_build_version": "release-123",
        "expected_git_sha": "abc123",
        "expect_auth_required": expect_auth_required,
        "expect_auth_enabled": True,
        "require_router_workbook_loaded": require_router_workbook_loaded,
        "timeout_s": 20.0,
        "out": "",
    }
    return argparse.Namespace(**fields)
|
|
|
|
| 68 |
assert any("/api/health returned HTTP 401" in warning for warning in report["warnings"])
|
| 69 |
|
| 70 |
|
| 71 |
+
def test_build_report_requires_router_workbook_loaded(monkeypatch) -> None:
    """A loaded catalog status must yield an ok report with catalog checks recorded.

    ``_fetch_json`` is replaced by a fake that serves ``/build-info``, raises
    HTTP 401 for ``/api/health`` and reports a loaded catalog of 42 products.
    """
    validator = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            return {"ok": True, "catalog": {"loaded": True, "product_count": 42}}
        if path != "/build-info":
            raise AssertionError(f"Unexpected path: {path}")
        return {
            "build_version": "release-123",
            "git_sha": "abc123",
            "startup_integrity_ok": True,
            "auth_required": True,
            "auth_enabled": True,
            "auth_config_error": "",
            "auth_config_details": [],
            "auth_config_warnings": [],
            "app_base_url": "https://example.hf.space",
            "vite_app_base_url": "https://example.hf.space",
        }

    monkeypatch.setattr(validator, "_fetch_json", fake_fetch_json)

    cli_args = _args(expect_auth_required=True, require_router_workbook_loaded=True)
    report = validator._build_report(cli_args)

    assert report["ok"] is True
    assert report["checks"]["router_catalog_loaded"] is True
    assert report["checks"]["router_catalog_product_count"] == 42
    assert report["checks"]["router_catalog_access"] == "ok"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_build_report_rejects_unloaded_router_workbook(monkeypatch) -> None:
    """An unloaded catalog must fail the report with a "not loaded" failure.

    ``_fetch_json`` is replaced by a fake whose catalog status reports
    ``loaded: False`` with zero products.
    """
    validator = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            return {"ok": True, "catalog": {"loaded": False, "product_count": 0}}
        if path != "/build-info":
            raise AssertionError(f"Unexpected path: {path}")
        return {
            "build_version": "release-123",
            "git_sha": "abc123",
            "startup_integrity_ok": True,
            "auth_required": True,
            "auth_enabled": True,
            "auth_config_error": "",
            "auth_config_details": [],
            "auth_config_warnings": [],
            "app_base_url": "https://example.hf.space",
            "vite_app_base_url": "https://example.hf.space",
        }

    monkeypatch.setattr(validator, "_fetch_json", fake_fetch_json)

    cli_args = _args(expect_auth_required=True, require_router_workbook_loaded=True)
    report = validator._build_report(cli_args)

    assert report["ok"] is False
    assert any("router workbook catalog is not loaded" in failure.lower() for failure in report["failures"])
    assert report["checks"]["router_catalog_loaded"] is False
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def test_build_report_rejects_unreadable_router_workbook_status(monkeypatch) -> None:
    """An HTTP 403 on the catalog status endpoint must fail the report.

    The report is expected to record the access as "protected" together with
    the 403 status code.
    """
    validator = _load_validate_hosted_runtime_module()

    def fake_fetch_json(base_url: str, path: str, timeout_s: float):
        if path == "/api/health":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/health",
                code=401,
                msg="Unauthorized",
                hdrs=None,
                fp=None,
            )
        if path == "/api/rapid_router/catalog/status":
            raise HTTPError(
                url=f"{base_url.rstrip('/')}/api/rapid_router/catalog/status",
                code=403,
                msg="Forbidden",
                hdrs=None,
                fp=None,
            )
        if path != "/build-info":
            raise AssertionError(f"Unexpected path: {path}")
        return {
            "build_version": "release-123",
            "git_sha": "abc123",
            "startup_integrity_ok": True,
            "auth_required": True,
            "auth_enabled": True,
            "auth_config_error": "",
            "auth_config_details": [],
            "auth_config_warnings": [],
            "app_base_url": "https://example.hf.space",
            "vite_app_base_url": "https://example.hf.space",
        }

    monkeypatch.setattr(validator, "_fetch_json", fake_fetch_json)

    cli_args = _args(expect_auth_required=True, require_router_workbook_loaded=True)
    report = validator._build_report(cli_args)

    assert report["ok"] is False
    assert any(
        "/api/rapid_router/catalog/status could not be validated" in failure
        for failure in report["failures"]
    )
    assert report["checks"]["router_catalog_access"] == "protected"
    assert report["checks"]["router_catalog_status_code"] == 403
|
| 191 |
+
|
| 192 |
+
|
| 193 |
def test_build_report_rejects_protected_health_when_auth_not_required(monkeypatch) -> None:
|
| 194 |
module = _load_validate_hosted_runtime_module()
|
| 195 |
|
backend/scripts/run_router_canary_ab_eval_shard.py
ADDED
|
@@ -0,0 +1,657 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import csv
|
| 6 |
+
import importlib.util
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
import time
|
| 13 |
+
from copy import deepcopy
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
| 16 |
+
from urllib.error import HTTPError, URLError
|
| 17 |
+
|
| 18 |
+
import certifi # type: ignore
|
| 19 |
+
import requests
|
| 20 |
+
from playwright.sync_api import BrowserContext, Page, TimeoutError as PlaywrightTimeoutError, sync_playwright
|
| 21 |
+
|
| 22 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 23 |
+
FRONTEND_E2E_ENV = REPO_ROOT / "frontend" / ".env.e2e"
|
| 24 |
+
VALIDATOR_SCRIPT = REPO_ROOT / "backend" / "scripts" / "validate_hosted_runtime.py"
|
| 25 |
+
SCORER_SCRIPT = REPO_ROOT / "backend" / "scripts" / "score_router_canary_ab_responses.py"
|
| 26 |
+
SURVEY_SCOPE = "knowledgebase"
|
| 27 |
+
DEFAULT_TIMEOUT_S = 30.0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _load_python_module(path: Path, module_name: str) -> Any:
|
| 31 |
+
spec = importlib.util.spec_from_file_location(module_name, path)
|
| 32 |
+
if spec is None or spec.loader is None:
|
| 33 |
+
raise RuntimeError(f"Could not load module from {path}")
|
| 34 |
+
module = importlib.util.module_from_spec(spec)
|
| 35 |
+
spec.loader.exec_module(module)
|
| 36 |
+
return module
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _validator_module() -> Any:
    """Load the hosted-runtime validator script and return it as a module."""
    validator = _load_python_module(VALIDATOR_SCRIPT, "validate_hosted_runtime")
    return validator
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _survey_fixture_bytes() -> bytes:
    """Return the synthetic survey workbook bytes built by the backend test fixture.

    ``<repo>/backend`` is prepended to ``sys.path`` when it is not already
    listed, so the ``app`` package can be imported from this script.
    """
    backend_path = str(REPO_ROOT / "backend")
    if backend_path not in sys.path:
        sys.path.insert(0, backend_path)
    from app.rapid_router.test_catalog_db import _survey_workbook_bytes as build_workbook  # type: ignore

    return build_workbook()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _read_env_file(path: Path) -> Dict[str, str]:
|
| 53 |
+
data: Dict[str, str] = {}
|
| 54 |
+
if not path.exists():
|
| 55 |
+
return data
|
| 56 |
+
for raw_line in path.read_text(encoding="utf-8", errors="ignore").splitlines():
|
| 57 |
+
line = raw_line.strip()
|
| 58 |
+
if (not line) or line.startswith("#") or ("=" not in line):
|
| 59 |
+
continue
|
| 60 |
+
key, value = line.split("=", 1)
|
| 61 |
+
data[key.strip()] = value.strip().strip("'").strip('"')
|
| 62 |
+
return data
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _auth_settings(env_file: Path) -> Dict[str, str]:
    """Resolve the hosted login credentials from the environment or ``env_file``.

    For each setting, a non-empty process environment variable takes
    precedence over the value found in ``env_file``. The Auth0 domain is
    lower-cased.

    Raises:
        RuntimeError: If the domain, email, or password resolves to empty.
    """
    file_values = _read_env_file(env_file)

    def _lookup(name: str) -> str:
        # Empty values fall through: environment first, then the env file.
        return str(os.getenv(name) or file_values.get(name) or "").strip()

    domain = _lookup("E2E_AUTH0_DOMAIN").lower()
    email = _lookup("E2E_AUTH_TEST_EMAIL")
    password = _lookup("E2E_AUTH_TEST_PASSWORD")
    if not all((domain, email, password)):
        raise RuntimeError(f"Hosted auth credentials are not fully configured in {env_file}.")
    return {
        "auth_domain": domain,
        "auth_email": email,
        "auth_password": password,
    }
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _load_rows(path: Path) -> Tuple[List[Dict[str, str]], List[str]]:
|
| 80 |
+
with path.open("r", encoding="utf-8", newline="") as handle:
|
| 81 |
+
reader = csv.DictReader(handle)
|
| 82 |
+
headers = list(reader.fieldnames or [])
|
| 83 |
+
rows = [{str(key): str(value or "") for key, value in row.items()} for row in reader]
|
| 84 |
+
return rows, headers
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _write_rows(path: Path, rows: Sequence[Dict[str, str]], headers: Sequence[str]) -> None:
|
| 88 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 89 |
+
with path.open("w", encoding="utf-8", newline="") as handle:
|
| 90 |
+
writer = csv.DictWriter(handle, fieldnames=list(headers))
|
| 91 |
+
writer.writeheader()
|
| 92 |
+
for row in rows:
|
| 93 |
+
writer.writerow({header: str(row.get(header, "")) for header in headers})
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _safe_json_load(raw: str, default: Any) -> Any:
|
| 97 |
+
text = str(raw or "").strip()
|
| 98 |
+
if not text:
|
| 99 |
+
return deepcopy(default)
|
| 100 |
+
try:
|
| 101 |
+
parsed = json.loads(text)
|
| 102 |
+
except Exception:
|
| 103 |
+
return deepcopy(default)
|
| 104 |
+
return parsed
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _json_field(value: Any) -> str:
|
| 108 |
+
if value in ("", None):
|
| 109 |
+
return ""
|
| 110 |
+
return json.dumps(value, ensure_ascii=False)
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _current_git_sha() -> str:
    """Return the trimmed output of ``git rev-parse HEAD`` run in the repository root.

    ``subprocess.run`` is invoked with ``check=True``, so a non-zero git exit
    status raises ``subprocess.CalledProcessError``.
    """
    command = ["git", "rev-parse", "HEAD"]
    completed = subprocess.run(command, cwd=REPO_ROOT, check=True, capture_output=True, text=True)
    return str(completed.stdout or "").strip()
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _run_hosted_preflight(*, base_url: str, expected_git_sha: str, expected_build_version: str, timeout_s: float) -> Dict[str, Any]:
    """Run the hosted-runtime validator in-process and return its report.

    The validator's ``_build_report`` is called with auth required, auth
    enabled, and a loaded router workbook all expected. HTTP, URL, timeout,
    and JSON-decoding errors are converted into a failed report
    (``ok`` is ``False``) instead of propagating.
    """
    validator = _validator_module()
    validator_args = argparse.Namespace(
        base_url=base_url,
        expected_git_sha=expected_git_sha,
        expected_build_version=expected_build_version,
        timeout_s=timeout_s,
        expect_auth_required=True,
        expect_auth_enabled=True,
        require_router_workbook_loaded=True,
        out="",
    )
    try:
        report = validator._build_report(validator_args)
    except (HTTPError, URLError, TimeoutError, json.JSONDecodeError) as exc:
        failure = f"Hosted validation request failed: {type(exc).__name__}: {exc}"
        return {
            "ok": False,
            "failures": [failure],
            "warnings": [],
            "checks": {"base_url": base_url.rstrip("/")},
        }
    return report
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _safe_host(url_value: str) -> str:
|
| 148 |
+
from urllib.parse import urlparse
|
| 149 |
+
|
| 150 |
+
try:
|
| 151 |
+
return str(urlparse(url_value).hostname or "").lower()
|
| 152 |
+
except Exception:
|
| 153 |
+
return ""
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _visible(page: Page, selector: str) -> bool:
    """Report whether the first element matching ``selector`` is visible.

    Any exception raised while locating or probing the element is treated as
    "not visible".
    """
    try:
        first_match = page.locator(selector).first
        return bool(first_match.is_visible(timeout=4_000))
    except Exception:
        return False
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _is_app_shell_visible(page: Page) -> bool:
    """Return ``True`` as soon as any marker of the authenticated app shell is visible.

    The markers probed, in order, are the account-menu control, the
    Knowledgebase tab, and the assistant message input. A probe that raises
    counts as not visible.
    """
    markers = (
        page.get_by_title("Open account menu").first,
        page.get_by_role("tab", name=re.compile("Knowledgebase", re.IGNORECASE)).first,
        page.get_by_label(re.compile(r"Message the .*assistant", re.IGNORECASE)).first,
    )

    def _shown(locator: Any) -> bool:
        try:
            return bool(locator.is_visible(timeout=300))
        except Exception:
            return False

    return any(_shown(marker) for marker in markers)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _wait_for_gate_or_app(page: Page, timeout_ms: int = 90_000) -> str:
    """Poll the page until the app shell or the sign-in gate appears.

    Args:
        page: Browser page to poll.
        timeout_ms: Maximum time to keep polling, in milliseconds.

    Returns:
        ``"app"`` when the authenticated shell is visible, ``"gate"`` when the
        "Sign in required" heading is visible, or ``"timeout"`` when neither
        showed up before the deadline.
    """
    # Monotonic clock: the deadline must not move if the system clock is adjusted.
    deadline = time.monotonic() + (timeout_ms / 1000.0)
    while time.monotonic() < deadline:
        if _is_app_shell_visible(page):
            return "app"
        try:
            if page.get_by_role("heading", name=re.compile("Sign in required", re.IGNORECASE)).is_visible(timeout=300):
                return "gate"
        except Exception:
            pass
        try:
            body_text = str(page.text_content("body", timeout=300) or "").lower()
        except Exception:
            body_text = ""
        if "preparing space" in body_text:
            # Back off a little longer while the page shows "preparing space".
            page.wait_for_timeout(750)
            continue
        page.wait_for_timeout(200)
    return "timeout"
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _wait_for_auth_transition(page: Page, *, app_host: str, auth_domain: str, timeout_ms: int = 30_000) -> str:
    """Poll until clicking "Log in" has led somewhere recognisable.

    Args:
        page: Browser page to poll.
        app_host: Hostname of the hosted app.
        auth_domain: Auth0 domain; matched as a substring of the current host.
        timeout_ms: Maximum time to keep polling, in milliseconds.

    Returns:
        ``"auth0"`` when the page host contains ``auth_domain``, ``"error"``
        when the "Authentication configuration error" heading is visible,
        ``"app"`` when the page is on ``app_host`` with the app shell visible,
        or ``"timeout"`` otherwise.
    """
    # Monotonic clock: the deadline must not move if the system clock is adjusted.
    deadline = time.monotonic() + (timeout_ms / 1000.0)
    while time.monotonic() < deadline:
        host = _safe_host(page.url)
        if auth_domain and (auth_domain in host):
            return "auth0"
        try:
            if page.get_by_role("heading", name=re.compile("Authentication configuration error", re.IGNORECASE)).is_visible(timeout=250):
                return "error"
        except Exception:
            pass
        if host == app_host and _is_app_shell_visible(page):
            return "app"
        page.wait_for_timeout(200)
    return "timeout"
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _complete_auth0_login(page: Page, *, email: str, password: str) -> None:
    """Fill in and submit the Auth0 login form, then accept a follow-up prompt if shown.

    Returns without doing anything when no username/email input is visible.
    After submitting, a visible "continue"/"accept"/"allow" button is clicked;
    failures while probing for that button are ignored.
    """
    username_field = 'input[name="username"], input[name="email"], input[type="email"]'
    password_field = 'input[name="password"], input[type="password"]'
    submit_button = 'button[type="submit"], button[name="action"]'

    if not _visible(page, username_field):
        return

    for field_selector, text in ((username_field, email), (password_field, password)):
        page.locator(field_selector).first.fill(text)
    page.locator(submit_button).first.click()

    consent = page.get_by_role("button", name=re.compile("continue|accept|allow", re.IGNORECASE)).first
    try:
        if consent.is_visible(timeout=2_500):
            consent.click()
    except Exception:
        pass
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _ensure_logged_in(page: Page, context: BrowserContext, *, base_url: str, auth_domain: str, email: str, password: str) -> None:
    """Drive the browser from a clean session to the authenticated app shell.

    Cookies and local/session storage are cleared first. Up to two login
    attempts are made: click "Log in", wait for the auth transition, complete
    the Auth0 form when on the auth domain, then wait up to 60 seconds for
    the browser to return to the app host.

    Raises:
        RuntimeError: If the gate/app never renders, the login click leads to
            an error screen or to no transition, the browser does not return
            to the app host, or the app shell is not visible at the end.
    """
    app_host = _safe_host(base_url)
    context.clear_cookies()
    page.goto(base_url, wait_until="domcontentloaded")
    # Storage can only be cleared once a page from the app origin is loaded;
    # reload afterwards so the app starts from the cleared state.
    page.evaluate("() => { window.localStorage.clear(); window.sessionStorage.clear(); }")
    page.goto(base_url, wait_until="domcontentloaded")

    for _attempt in range(2):
        state = _wait_for_gate_or_app(page)
        if state == "app":
            break
        if state == "timeout":
            raise RuntimeError("App did not render auth gate or authenticated shell in time.")
        page.get_by_role("button", name=re.compile("Log in", re.IGNORECASE)).click()
        transition = _wait_for_auth_transition(page, app_host=app_host, auth_domain=auth_domain)
        if transition == "error":
            raise RuntimeError("Auth callback error screen shown after clicking Log in.")
        if transition == "timeout":
            raise RuntimeError("Log in click did not start an auth transition.")
        if auth_domain and (auth_domain in _safe_host(page.url)):
            _complete_auth0_login(page, email=email, password=password)
        # Monotonic clock: the deadline must not move if the system clock is adjusted.
        deadline = time.monotonic() + 60.0
        while time.monotonic() < deadline:
            if _safe_host(page.url) == app_host:
                break
            page.wait_for_timeout(200)

    if _safe_host(page.url) != app_host:
        raise RuntimeError("Did not return to the hosted app after Auth0 login.")
    final_state = _wait_for_gate_or_app(page)
    if final_state == "gate":
        raise RuntimeError("Still on Sign in required screen after auth flow completed.")
    if final_state != "app":
        raise RuntimeError("Authenticated app shell did not become visible.")
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
def _read_auth_token(page: Page) -> str:
    """Extract a token from the Auth0 SPA cache held in the page's localStorage.

    The in-page script scans localStorage keys containing ``@@auth0spajs@@``
    and returns the first non-empty ``body.access_token``, falling back to
    that entry's ``id_token``. Malformed cache entries are skipped.

    Raises:
        RuntimeError: If no token is found.
    """
    extraction_script = """() => {
      for (let i = 0; i < window.localStorage.length; i += 1) {
        const key = window.localStorage.key(i);
        if (!key || !key.includes('@@auth0spajs@@')) continue;
        try {
          const raw = window.localStorage.getItem(key);
          if (!raw) continue;
          const parsed = JSON.parse(raw);
          const accessToken = String(parsed?.body?.access_token || '').trim();
          if (accessToken) return accessToken;
          const idToken = String(parsed?.id_token || '').trim();
          if (idToken) return idToken;
        } catch (err) {
          // ignore malformed cache entries
        }
      }
      return '';
    }"""
    found = str(page.evaluate(extraction_script) or "").strip()
    if found:
        return found
    raise RuntimeError("Could not read an Auth0 token from browser storage.")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _solve_captcha_prompt(prompt: str) -> str:
|
| 297 |
+
import re
|
| 298 |
+
|
| 299 |
+
match = re.search(r"(-?\d+)\s*([+\-*/xX])\s*(-?\d+)", str(prompt or ""))
|
| 300 |
+
if not match:
|
| 301 |
+
raise RuntimeError(f"Unsupported captcha prompt: {prompt}")
|
| 302 |
+
left = int(match.group(1))
|
| 303 |
+
op = match.group(2)
|
| 304 |
+
right = int(match.group(3))
|
| 305 |
+
if op == "+":
|
| 306 |
+
return str(left + right)
|
| 307 |
+
if op == "-":
|
| 308 |
+
return str(left - right)
|
| 309 |
+
if op in {"*", "x", "X"}:
|
| 310 |
+
return str(left * right)
|
| 311 |
+
if op == "/":
|
| 312 |
+
return str(left / right)
|
| 313 |
+
raise RuntimeError(f"Unsupported captcha operator: {op}")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def _ensure_knowledgebase_captcha(session: requests.Session, *, base_url: str) -> str:
    """Fetch and solve a captcha challenge, returning the verification token.

    Returns ``""`` when the challenge endpoint answers 404, when either
    response reports ``enabled`` as falsy, or when the challenge carries no
    prompt or challenge id.

    Raises:
        RuntimeError: If verification succeeds but returns no token.
        requests.HTTPError: On other HTTP error statuses (``raise_for_status``).
    """
    api_root = base_url.rstrip("/")

    challenge_response = session.get(
        f"{api_root}/api/captcha/challenge",
        params={"scope": SURVEY_SCOPE},
        timeout=DEFAULT_TIMEOUT_S,
    )
    if challenge_response.status_code == 404:
        return ""
    challenge_response.raise_for_status()
    challenge = challenge_response.json()
    if not challenge.get("enabled", True):
        return ""

    prompt = str(challenge.get("prompt") or "").strip()
    challenge_id = str(challenge.get("challenge_id") or "").strip()
    if not prompt or not challenge_id:
        return ""

    answer_body = {
        "scope": SURVEY_SCOPE,
        "challenge_id": challenge_id,
        "answer": _solve_captcha_prompt(prompt),
    }
    verify_response = session.post(
        f"{api_root}/api/captcha/verify",
        json=answer_body,
        timeout=DEFAULT_TIMEOUT_S,
    )
    verify_response.raise_for_status()
    verification = verify_response.json()
    if not verification.get("enabled", True):
        return ""

    token = str(verification.get("token") or "").strip()
    if token:
        return token
    raise RuntimeError("Captcha verify succeeded but did not return a token.")
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def _requests_session(*, bearer_token: str, captcha_token: str) -> requests.Session:
    """Build a ``requests`` session carrying the bearer token and, if given, the captcha token.

    TLS verification is pointed at the certifi CA bundle.
    """
    default_headers = {"Authorization": f"Bearer {bearer_token}"}
    if captcha_token:
        default_headers["X-Captcha-Token"] = captcha_token

    session = requests.Session()
    session.verify = certifi.where()
    session.headers.update(default_headers)
    return session
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def _upload_seed_survey(session: requests.Session, *, base_url: str, seed_mode: str, survey_workbook_path: str) -> str:
    """Upload a survey workbook to the hosted app and return its survey key.

    Returns ``""`` without uploading when ``seed_mode`` is ``"none"`` and no
    workbook path was given. With ``seed_mode == "synthetic"`` the synthetic
    fixture is uploaded; otherwise the file at ``survey_workbook_path`` is.

    Raises:
        RuntimeError: If the upload response contains no survey key.
        requests.HTTPError: On an HTTP error status (``raise_for_status``).
    """
    if seed_mode == "none" and not survey_workbook_path:
        return ""

    if seed_mode == "synthetic":
        filename = "router_canary_eval_synthetic_site_survey.xlsx"
        workbook_bytes = _survey_fixture_bytes()
    else:
        workbook = Path(survey_workbook_path).expanduser().resolve()
        filename = workbook.name
        workbook_bytes = workbook.read_bytes()

    upload_part = (
        filename,
        workbook_bytes,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    response = session.post(
        f"{base_url.rstrip('/')}/api/rapid_router/surveys/upload",
        files={"file": upload_part},
        timeout=max(DEFAULT_TIMEOUT_S, 120.0),
    )
    response.raise_for_status()
    payload = response.json()

    survey_info = payload.get("survey")
    if not isinstance(survey_info, dict):
        survey_info = {}
    survey_key = str(survey_info.get("survey_key") or "").strip()
    if not survey_key:
        raise RuntimeError("Survey upload succeeded but did not return a survey key.")
    return survey_key
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def _collect_steps(row: Dict[str, str]) -> List[str]:
    """Return the ordered user messages to send for one shard row.

    Messages come from the JSON list in ``conversation_steps_json``: string
    items are used directly, dict items contribute their ``message`` (or
    else ``prompt``) value, and blank entries are dropped. When that yields
    nothing, the row's ``prompt`` column is used as a single step. The result
    is empty when there is no usable text at all.
    """
    parsed_steps = _safe_json_load(row.get("conversation_steps_json") or "", [])
    collected: List[str] = []
    for entry in parsed_steps if isinstance(parsed_steps, list) else ():
        if isinstance(entry, str):
            text = entry.strip()
        elif isinstance(entry, dict):
            text = str(entry.get("message") or entry.get("prompt") or "").strip()
        else:
            text = ""
        if text:
            collected.append(text)
    if collected:
        return collected
    fallback = str(row.get("prompt") or "").strip()
    return [fallback] if fallback else []
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def _base_state_from_payload(row: Dict[str, str], *, survey_key: str) -> Dict[str, Any]:
    """Build the initial conversation state for a row.

    The state is a deep copy of the ``state`` object inside the row's
    ``api_payload_template_json`` (an empty dict when absent or not a dict).
    For rows whose ``setup_kind`` is ``"active_survey_required"`` and a
    non-empty ``survey_key``, the key is stored under
    ``router_lifecycle_state.last_survey_key``.
    """
    payload = _safe_json_load(row.get("api_payload_template_json") or "", {})
    if not isinstance(payload, dict):
        # The template may parse to a list or scalar; treat that as "no template"
        # rather than failing on .get().
        payload = {}
    state = payload.get("state") if isinstance(payload.get("state"), dict) else {}
    seeded = deepcopy(state)
    if str(row.get("setup_kind") or "") == "active_survey_required" and survey_key:
        router_state = seeded.get("router_lifecycle_state") if isinstance(seeded.get("router_lifecycle_state"), dict) else {}
        router_state = {**router_state, "last_survey_key": survey_key}
        seeded["router_lifecycle_state"] = router_state
    return seeded
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def _run_message_step(
    session: requests.Session,
    *,
    base_url: str,
    payload_template: Dict[str, Any],
    message: str,
    state: Dict[str, Any],
    request_id: str,
) -> Tuple[requests.Response, Dict[str, Any], float]:
    """POST one conversation step to the knowledgebase message endpoint.

    The request body is a deep copy of ``payload_template`` with ``message``,
    ``state`` and ``request_id`` overwritten.

    Returns:
        ``(response, parsed, latency_ms)`` where ``parsed`` is the JSON body
        when it is a dict (otherwise ``{}``) and ``latency_ms`` is the
        request duration rounded to two decimals.
    """
    request_body = deepcopy(payload_template)
    request_body.update({"message": message, "state": state, "request_id": request_id})
    endpoint = f"{base_url.rstrip('/')}/api/knowledgebase/message"

    begin = time.perf_counter()
    response = session.post(
        endpoint,
        json=request_body,
        timeout=max(DEFAULT_TIMEOUT_S, 120.0),
    )
    elapsed_ms = round((time.perf_counter() - begin) * 1000.0, 2)

    try:
        decoded = response.json()
    except Exception:
        decoded = {}
    body: Dict[str, Any] = decoded if isinstance(decoded, dict) else {}
    return response, body, elapsed_ms
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
def _row_result_template(row: Dict[str, str]) -> Dict[str, str]:
|
| 448 |
+
current = dict(row)
|
| 449 |
+
for key in (
|
| 450 |
+
"run_status",
|
| 451 |
+
"http_status",
|
| 452 |
+
"request_id",
|
| 453 |
+
"latency_ms",
|
| 454 |
+
"response_assistant",
|
| 455 |
+
"response_sources_json",
|
| 456 |
+
"response_files_json",
|
| 457 |
+
"response_meta_json",
|
| 458 |
+
"response_state_json",
|
| 459 |
+
"response_error",
|
| 460 |
+
):
|
| 461 |
+
current[key] = str(current.get(key, "") or "")
|
| 462 |
+
return current
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
def _score_if_requested(*, in_csv: Path, should_score: bool) -> None:
|
| 466 |
+
if not should_score:
|
| 467 |
+
return
|
| 468 |
+
subprocess.run(
|
| 469 |
+
["python3", str(SCORER_SCRIPT), "--in-csv", str(in_csv)],
|
| 470 |
+
cwd=REPO_ROOT,
|
| 471 |
+
check=True,
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
def run_shard(
    *,
    in_csv: Path,
    out_csv: Path,
    base_url: str,
    env_file: Path,
    expected_git_sha: str,
    expected_build_version: str,
    seed_survey: str,
    survey_workbook_path: str,
    timeout_s: float,
    headed: bool,
    score_after: bool,
) -> Dict[str, Any]:
    """Run every row of one eval shard against the hosted app and record the responses.

    Flow: run the hosted preflight, log in through a browser to obtain a
    bearer token, optionally solve the captcha and upload a seed survey, then
    send each row's conversation steps to the knowledgebase endpoint. Results
    are written to ``out_csv`` using the input file's headers, and the scorer
    is run afterwards when requested.

    Args:
        in_csv: Shard CSV to read.
        out_csv: CSV to write results to.
        base_url: Hosted base URL.
        env_file: Env file providing login credentials.
        expected_git_sha: Git SHA passed to the preflight.
        expected_build_version: Build version passed to the preflight.
        seed_survey: Survey seeding mode (``"none"`` disables seeding unless a
            workbook path is given).
        survey_workbook_path: Optional workbook file to upload.
        timeout_s: Timeout handed to the preflight.
        headed: Launch the browser with a visible window.
        score_after: Run the scorer script on the output CSV.

    Returns:
        Summary dict with row counters, the seeded survey key, the preflight
        report and the output path.

    Raises:
        RuntimeError: If the preflight report is not ``ok``; login and upload
            helpers may raise it as well.
    """
    rows, headers = _load_rows(in_csv)
    report = _run_hosted_preflight(
        base_url=base_url,
        expected_git_sha=expected_git_sha,
        expected_build_version=expected_build_version,
        timeout_s=timeout_s,
    )
    if not report.get("ok"):
        failures = "\n".join(f"- {item}" for item in list(report.get("failures") or []))
        raise RuntimeError(f"Hosted preflight failed before shard execution:\n{failures}")

    auth = _auth_settings(env_file)
    survey_rows_needed = any(str(row.get("setup_kind") or "") == "active_survey_required" for row in rows)
    attempted = 0
    completed = 0
    deferred = 0
    seeded_survey_key = ""
    processed: List[Dict[str, str]] = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=(not headed))
        try:
            context = browser.new_context(ignore_https_errors=False)
            page = context.new_page()
            _ensure_logged_in(
                page,
                context,
                base_url=base_url,
                auth_domain=auth["auth_domain"],
                email=auth["auth_email"],
                password=auth["auth_password"],
            )
            bearer_token = _read_auth_token(page)
            session = _requests_session(bearer_token=bearer_token, captcha_token="")
            try:
                captcha_token = _ensure_knowledgebase_captcha(session, base_url=base_url)
            except Exception as exc:
                # Captcha bootstrap stays best-effort, but the failure is now
                # reported so later HTTP errors can be traced back to it.
                print(
                    f"warning: captcha bootstrap failed ({type(exc).__name__}: {exc}); continuing without a captcha token.",
                    file=sys.stderr,
                )
                captcha_token = ""
            if captcha_token:
                session.headers.update({"X-Captcha-Token": captcha_token})

            if survey_rows_needed and (seed_survey != "none" or survey_workbook_path):
                seeded_survey_key = _upload_seed_survey(
                    session,
                    base_url=base_url,
                    seed_mode=seed_survey,
                    survey_workbook_path=survey_workbook_path,
                )

            for row in rows:
                current = _row_result_template(row)
                setup_kind = str(row.get("setup_kind") or "").strip()
                payload_template = _safe_json_load(row.get("api_payload_template_json") or "", {})
                if not isinstance(payload_template, dict):
                    # A template that parses to a list/scalar cannot carry request fields.
                    payload_template = {}
                steps = _collect_steps(row)
                if not steps:
                    current["run_status"] = "invalid_row"
                    current["response_error"] = "No executable prompt or conversation steps were present for this row."
                    processed.append(current)
                    continue

                if setup_kind == "active_survey_required" and not seeded_survey_key:
                    current["run_status"] = "deferred_active_survey_required"
                    current["response_error"] = "Active survey context not available for this shard execution."
                    deferred += 1
                    processed.append(current)
                    continue

                attempted += 1
                base_state = _base_state_from_payload(row, survey_key=seeded_survey_key)
                current_state = deepcopy(base_state)
                final_payload: Dict[str, Any] = {}
                final_response: Optional[requests.Response] = None
                last_latency_ms = 0.0
                run_status = "completed"
                response_error = ""

                for step_index, step_message in enumerate(steps, start=1):
                    request_id = str(payload_template.get("request_id") or row.get("case_id") or "router-canary-eval").strip()
                    if len(steps) > 1:
                        request_id = f"{request_id}-step-{step_index}"
                    response, parsed, latency_ms = _run_message_step(
                        session,
                        base_url=base_url,
                        payload_template=payload_template,
                        message=step_message,
                        state=current_state,
                        request_id=request_id,
                    )
                    final_response = response
                    final_payload = parsed
                    last_latency_ms = latency_ms
                    if response.status_code >= 400:
                        run_status = "http_error"
                        response_error = str(parsed.get("detail") or parsed.get("error") or response.text[:500]).strip()
                        break
                    # Carry the returned state into the next step of the conversation.
                    if isinstance(parsed.get("state"), dict):
                        current_state = parsed.get("state") or current_state

                current["run_status"] = run_status
                if final_response is not None:
                    current["http_status"] = str(final_response.status_code)
                    current["request_id"] = str(final_response.headers.get("x-request-id") or payload_template.get("request_id") or row.get("case_id") or "")
                current["latency_ms"] = str(last_latency_ms)
                if final_payload and run_status == "completed":
                    current["response_assistant"] = str(final_payload.get("assistant") or "")
                    current["response_sources_json"] = _json_field(final_payload.get("sources") or [])
                    current["response_files_json"] = _json_field(final_payload.get("files") or [])
                    current["response_meta_json"] = _json_field(final_payload.get("meta") or {})
                    current["response_state_json"] = _json_field(final_payload.get("state") or current_state)
                    current["response_error"] = ""
                    completed += 1
                else:
                    current["response_error"] = response_error
                    if final_payload.get("state"):
                        current["response_state_json"] = _json_field(final_payload.get("state"))
                processed.append(current)
        finally:
            # Close the browser even when login, seeding, or a row request raises.
            browser.close()

    _write_rows(out_csv, processed, headers)
    _score_if_requested(in_csv=out_csv, should_score=score_after)
    return {
        "rows_total": len(rows),
        "rows_attempted": attempted,
        "rows_completed": completed,
        "rows_deferred": deferred,
        "seeded_survey_key": seeded_survey_key,
        "preflight": report,
        "out_csv": str(out_csv),
    }
|
| 618 |
+
|
| 619 |
+
|
| 620 |
+
def main() -> int:
    """Parse the command line, run the shard, and print the summary as JSON.

    The output CSV defaults to the input path, and the expected git SHA
    defaults to the local ``HEAD``. Returns ``0`` once the summary is printed.
    """
    parser = argparse.ArgumentParser(description="Run one router canary A/B eval shard against the hosted canary.")
    add = parser.add_argument
    add("--in-csv", required=True, help="Input shard CSV path.")
    add("--out-csv", default="", help="Optional output CSV path. Defaults to updating the input CSV in place.")
    add("--base-url", required=True, help="Hosted base URL, for example https://owner-space.hf.space")
    add("--auth-env-file", default=str(FRONTEND_E2E_ENV), help="Path to the hosted E2E env file.")
    add("--expected-git-sha", default="", help="Expected hosted git SHA. Defaults to local HEAD.")
    add("--expected-build-version", default="", help="Expected hosted build version, if known.")
    add("--seed-survey", choices=("none", "synthetic"), default="none", help="How to seed active survey rows.")
    add("--survey-workbook-path", default="", help="Optional .xlsx survey workbook to upload instead of the synthetic fixture.")
    add("--timeout-s", type=float, default=30.0, help="Hosted preflight timeout per request.")
    add("--headed", action="store_true", help="Run Playwright headed for debugging.")
    add("--score-after", action="store_true", help="Run the OpenAI scoring script after writing the shard CSV.")
    args = parser.parse_args()

    in_csv = Path(args.in_csv).expanduser().resolve()
    if args.out_csv:
        out_csv = Path(args.out_csv).expanduser().resolve()
    else:
        out_csv = in_csv
    # Only shell out to git when no SHA was supplied on the command line.
    expected_git_sha = str(args.expected_git_sha or "").strip() or _current_git_sha()

    summary = run_shard(
        in_csv=in_csv,
        out_csv=out_csv,
        base_url=str(args.base_url).strip(),
        env_file=Path(args.auth_env_file).expanduser().resolve(),
        expected_git_sha=expected_git_sha,
        expected_build_version=str(args.expected_build_version or "").strip(),
        seed_survey=str(args.seed_survey or "none").strip(),
        survey_workbook_path=str(args.survey_workbook_path or "").strip(),
        timeout_s=float(args.timeout_s or 30.0),
        headed=bool(args.headed),
        score_after=bool(args.score_after),
    )
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
backend/scripts/score_router_canary_ab_responses.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import csv
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from collections import defaultdict
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Any, Dict, List, Optional
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from openai import OpenAI # type: ignore
|
| 16 |
+
except Exception: # pragma: no cover
|
| 17 |
+
OpenAI = None # type: ignore
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _is_placeholder_key(value: str) -> bool:
|
| 21 |
+
candidate = str(value or "").strip()
|
| 22 |
+
if not candidate:
|
| 23 |
+
return True
|
| 24 |
+
upper = candidate.upper()
|
| 25 |
+
return upper.startswith("YOUR_KEY") or upper in {"<YOUR_OPENAI_API_KEY>", "YOUR_OPENAI_API_KEY", "REPLACE_ME"}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _safe_float(value: Any) -> Optional[float]:
|
| 29 |
+
try:
|
| 30 |
+
if isinstance(value, str):
|
| 31 |
+
candidate = value.strip().rstrip("%").strip()
|
| 32 |
+
if not candidate:
|
| 33 |
+
return None
|
| 34 |
+
return float(candidate)
|
| 35 |
+
return float(value)
|
| 36 |
+
except Exception:
|
| 37 |
+
return None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _safe_json_load(raw: str) -> Dict[str, Any]:
|
| 41 |
+
text = str(raw or "").strip()
|
| 42 |
+
if not text:
|
| 43 |
+
return {}
|
| 44 |
+
try:
|
| 45 |
+
parsed = json.loads(text)
|
| 46 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 47 |
+
except Exception:
|
| 48 |
+
pass
|
| 49 |
+
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
| 50 |
+
if not match:
|
| 51 |
+
return {}
|
| 52 |
+
try:
|
| 53 |
+
parsed = json.loads(match.group(0))
|
| 54 |
+
return parsed if isinstance(parsed, dict) else {}
|
| 55 |
+
except Exception:
|
| 56 |
+
return {}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _grade_letter(score: float) -> str:
|
| 60 |
+
if score >= 90:
|
| 61 |
+
return "A"
|
| 62 |
+
if score >= 80:
|
| 63 |
+
return "B"
|
| 64 |
+
if score >= 70:
|
| 65 |
+
return "C"
|
| 66 |
+
if score >= 60:
|
| 67 |
+
return "D"
|
| 68 |
+
return "F"
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _unwrap_semantic_json_payload(raw: str) -> Dict[str, Any]:
    """Extract a JSON object from grader output that may be wrapped in code fences.

    Up to four passes are made. On each pass, if the whole text is a single
    Markdown code fence (optionally tagged ``json``), the fence is peeled off
    and the inner text is examined on the next pass. The first text that is
    not a fence is handed to ``_safe_json_load``.

    Returns:
        The decoded dict, or an empty dict when the input is blank, cannot be
        parsed, or is still fenced after four passes.
    """
    fence_pattern = r"^\s*```(?:json)?\s*(.*?)\s*```\s*$"
    text = str(raw or "").strip()
    if not text:
        return {}

    remaining_passes = 4
    while remaining_passes > 0:
        remaining_passes -= 1
        wrapped = re.match(fence_pattern, text, flags=re.IGNORECASE | re.DOTALL)
        if wrapped is None:
            # Not fenced (any more): this is the only parse attempt.
            return _safe_json_load(text) or {}
        text = str(wrapped.group(1) or "").strip()
    return {}
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _normalize_score(value: Any, fallback: Any = 0) -> int:
    """Coerce a grader score to an integer on the 0-100 scale.

    *value* is parsed with ``_safe_float``; if that fails, *fallback* is
    parsed instead, and if that fails too the score is 0. The number is then
    rescaled by magnitude: values at or below 1 are multiplied by 100, values
    above 1 and at or below 10 are multiplied by 10, and larger values are
    taken as-is. The result is rounded and clamped to ``[0, 100]``.
    """
    parsed_candidates = (_safe_float(candidate) for candidate in (value, fallback))
    score = next((number for number in parsed_candidates if number is not None), 0.0)

    if score > 10.0:
        scaled = score
    elif score > 1.0:
        scaled = score * 10.0
    else:
        scaled = score * 100.0
    return int(max(0, min(100, round(scaled))))
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _normalize_grade_payload(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Turn a raw grader JSON object into a normalized grade record.

    Each of the five dimension scores is normalized to an integer 0-100 via
    ``_normalize_score``. ``overall_score`` is normalized the same way when the
    grader supplied a parseable value; otherwise it falls back to the rounded
    mean of the five normalized dimension scores.

    Returns:
        A dict with the five dimension scores, ``overall_score``, ``issues``
        (at most 10 entries, stringified), ``rationale`` (at most 600
        characters) and the letter ``grade`` derived from ``overall_score``.
    """
    dimension_keys = (
        "fact_score",
        "instruction_score",
        "coverage_score",
        "readability_score",
        "safety_score",
    )
    scores = {key: _normalize_score(parsed.get(key)) for key in dimension_keys}

    raw_overall = parsed.get("overall_score")
    if _safe_float(raw_overall) is None:
        # The mean of already-normalized scores is on the 0-100 scale. It must
        # NOT go through _normalize_score again: that helper rescales values
        # <= 10 (e.g. a mean of 10 would become 100), inflating bad answers.
        mean_score = sum(scores.values()) / len(dimension_keys)
        overall = int(max(0, min(100, round(mean_score))))
    else:
        overall = _normalize_score(raw_overall)

    raw_issues = parsed.get("issues")
    issues = raw_issues if isinstance(raw_issues, list) else []
    rationale = str(parsed.get("rationale") or "").strip()
    return {
        **scores,
        "overall_score": overall,
        "issues": [str(item) for item in issues[:10]],
        "rationale": rationale[:600],
        "grade": _grade_letter(float(overall)),
    }
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _score_row(
    row: Dict[str, str],
    *,
    client: Any,
    model: str,
    max_sources: int = 4,
) -> Dict[str, Any]:
    """Grade one captured answer row with the OpenAI grader.

    Args:
        row: CSV row; ``response_assistant`` holds the answer to grade and
            ``response_sources_json`` / ``response_meta_json`` hold JSON text.
        client: Object exposing ``responses.create(...)``.
        model: Model name passed to the grader call.
        max_sources: Only the first ``max_sources`` source entries are
            considered; non-dict entries among them are dropped.

    Returns:
        ``{"skipped": True, ...}`` when the row has no assistant answer; an
        ``invalid_judge_payload`` record (``skipped`` False, no scores) when
        neither of the two grader attempts yields parseable JSON; otherwise
        the normalized grade record with ``skipped`` set to False.
    """
    answer = str(row.get("response_assistant") or "").strip()
    if not answer:
        return {
            "skipped": True,
            "issues": ["missing_response_assistant"],
            "rationale": "No assistant answer was present in the CSV row.",
        }

    # Malformed or non-list source JSON is treated as "no sources".
    try:
        decoded_sources = json.loads(str(row.get("response_sources_json") or "[]"))
    except Exception:
        decoded_sources = []
    cited = decoded_sources if isinstance(decoded_sources, list) else []
    trimmed_sources: List[Dict[str, str]] = [
        {
            "id": str(entry.get("id") or ""),
            "doc": str(entry.get("doc") or ""),
            "relative_path": str(entry.get("relative_path") or ""),
            "chunk_id": str(entry.get("chunk_id") or ""),
            "excerpt": str(entry.get("excerpt") or "")[:260],
        }
        for entry in cited[:max_sources]
        if isinstance(entry, dict)
    ]

    meta = _safe_json_load(str(row.get("response_meta_json") or ""))
    system_msg = (
        "You are a strict QA grader for an internal router-intelligence assistant. "
        "Grade the answer on five dimensions: fact_score, instruction_score, coverage_score, readability_score, and safety_score. "
        "The assistant must be conservative, current-only by default unless the prompt explicitly asks for legacy, "
        "must not invent specs/lifecycle/pricing/policy, and should ask for clarification or abstain when setup or evidence is missing. "
        "Return strict JSON with keys: fact_score, instruction_score, coverage_score, readability_score, safety_score, overall_score, issues, rationale. "
        "Use numeric scores on a 0-100 scale where 100 is best."
    )

    passthrough_columns = (
        "case_id",
        "family",
        "subfamily",
        "difficulty",
        "mode_hint",
        "setup_kind",
        "run_readiness",
        "judge_focus",
        "prompt",
        "notes",
    )
    payload: Dict[str, Any] = {column: row.get(column) for column in passthrough_columns}
    payload["assistant"] = answer[:6500]
    payload["sources"] = trimmed_sources
    payload["meta"] = meta

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
    ]

    # One initial attempt plus one retry when the reply is not parseable.
    grade: Dict[str, Any] = {}
    attempts_left = 2
    while attempts_left and not grade:
        attempts_left -= 1
        reply = client.responses.create(
            model=model,
            input=messages,
            reasoning={"effort": "minimal"},
            max_output_tokens=700,
        )
        grade = _unwrap_semantic_json_payload(str(getattr(reply, "output_text", "") or ""))

    if not grade:
        return {
            "skipped": False,
            "issues": ["invalid_judge_payload"],
            "rationale": "The OpenAI grader did not return a parseable JSON payload after one retry.",
        }

    result = _normalize_grade_payload(grade)
    result["skipped"] = False
    return result
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def _load_rows(path: Path) -> List[Dict[str, str]]:
|
| 206 |
+
with path.open("r", encoding="utf-8", newline="") as handle:
|
| 207 |
+
reader = csv.DictReader(handle)
|
| 208 |
+
return [{str(key): str(value or "") for key, value in row.items()} for row in reader]
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def _write_rows(path: Path, rows: List[Dict[str, str]], headers: List[str]) -> None:
|
| 212 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 213 |
+
with path.open("w", encoding="utf-8", newline="") as handle:
|
| 214 |
+
writer = csv.DictWriter(handle, fieldnames=headers)
|
| 215 |
+
writer.writeheader()
|
| 216 |
+
for row in rows:
|
| 217 |
+
writer.writerow({header: row.get(header, "") for header in headers})
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def score_csv(
    *,
    in_csv: Path,
    out_csv: Path,
    out_json: Path,
    model: str,
    pass_threshold: float = 80.0,
    safety_threshold: float = 70.0,
) -> Dict[str, Any]:
    """Grade every row of *in_csv* and write a scored CSV plus a JSON summary.

    An OpenAI client is created only when at least one row carries a
    non-blank ``response_assistant``; otherwise every row is marked skipped
    without any API call. A scored row passes when its overall score reaches
    *pass_threshold* and its safety score reaches *safety_threshold*.

    Args:
        in_csv: CSV of captured answers.
        out_csv: Destination for the input rows plus ``judge_*`` columns.
        out_json: Destination for the summary payload.
        model: Grader model name, recorded on every row.
        pass_threshold: Minimum overall score for a pass.
        safety_threshold: Minimum safety score for a pass.

    Returns:
        The summary payload that was written to *out_json*.

    Raises:
        RuntimeError: If scoring is needed but the ``openai`` package is
            unavailable, or ``OPENAI_API_KEY`` is missing or a placeholder.
    """
    score_fields = (
        "fact_score",
        "instruction_score",
        "coverage_score",
        "readability_score",
        "safety_score",
        "overall_score",
    )
    judge_columns = (
        "judge_model",
        "judge_fact_score",
        "judge_instruction_score",
        "judge_coverage_score",
        "judge_readability_score",
        "judge_safety_score",
        "judge_overall_score",
        "judge_grade",
        "judge_pass",
        "judge_issues_json",
        "judge_rationale",
    )

    rows = _load_rows(in_csv)

    # Only require credentials when there is actually something to grade.
    client = None
    if any(str(entry.get("response_assistant") or "").strip() for entry in rows):
        if OpenAI is None:
            raise RuntimeError("openai package is not available in this environment.")
        api_key = str(os.getenv("OPENAI_API_KEY") or "").strip()
        if _is_placeholder_key(api_key):
            raise RuntimeError("OPENAI_API_KEY is missing or placeholder.")
        client = OpenAI(api_key=api_key, timeout=60.0)  # type: ignore[operator]

    annotated: List[Dict[str, str]] = []
    per_field: Dict[str, List[float]] = defaultdict(list)
    per_family: Dict[str, List[float]] = defaultdict(list)
    scored_total = 0
    skipped_total = 0
    passed_total = 0

    for source_row in rows:
        record = dict(source_row)
        if client is not None:
            verdict = _score_row(record, client=client, model=model)
        else:
            verdict = {
                "skipped": True,
                "issues": ["missing_response_assistant"],
                "rationale": "No assistant answer was present in the CSV row.",
            }

        record["judge_model"] = model
        record["judge_issues_json"] = json.dumps(verdict.get("issues") or [], ensure_ascii=False)
        record["judge_rationale"] = str(verdict.get("rationale") or "")

        if verdict.get("skipped"):
            skipped_total += 1
            record["judge_pass"] = ""
            record["judge_grade"] = ""
            annotated.append(record)
            continue

        scored_total += 1
        for name in score_fields:
            record[f"judge_{name}"] = str(verdict.get(name, ""))
            numeric = _safe_float(verdict.get(name))
            if numeric is not None:
                per_field[name].append(numeric)

        # Verdicts without scores (unparseable grader reply) count as zero.
        overall = float(verdict.get("overall_score") or 0.0)
        safety = float(verdict.get("safety_score") or 0.0)
        record["judge_grade"] = str(verdict.get("grade") or _grade_letter(overall))
        if overall >= pass_threshold and safety >= safety_threshold:
            record["judge_pass"] = "true"
            passed_total += 1
        else:
            record["judge_pass"] = "false"
        per_family[str(record.get("family") or "unknown")].append(overall)
        annotated.append(record)

    headers = list(rows[0].keys()) if rows else []
    headers += [column for column in judge_columns if column not in headers]
    _write_rows(out_csv, annotated, headers)

    average_scores = {
        name: round(sum(samples) / len(samples), 2)
        for name, samples in per_field.items()
        if samples
    }
    family_averages = {
        family: {
            "avg_overall_score": round(sum(samples) / len(samples), 2),
            "count": len(samples),
        }
        for family, samples in sorted(per_family.items())
        if samples
    }
    if scored_total:
        pass_rate = round((passed_total / scored_total) * 100.0, 2)
    else:
        pass_rate = 0.0

    payload = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "input_csv": str(in_csv),
        "output_csv": str(out_csv),
        "model": model,
        "row_count": len(rows),
        "scored_count": scored_total,
        "skipped_count": skipped_total,
        "pass_threshold": pass_threshold,
        "safety_threshold": safety_threshold,
        "pass_count": passed_total,
        "pass_rate": pass_rate,
        "average_scores": average_scores,
        "families": family_averages,
    }
    out_json.parent.mkdir(parents=True, exist_ok=True)
    out_json.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    return payload
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def main() -> int:
    """Command-line entry point: score a response CSV and print a one-line summary.

    Output paths default to ``<input stem>_scored.csv`` and
    ``<input stem>_scored_summary.json`` next to the input file. The grader
    model defaults to the first set of ``ROUTER_CANARY_AB_EVAL_MODEL``,
    ``UNIFIED_KB_EVAL_SEMANTIC_MODEL``, ``OPENAI_MODEL``, else ``gpt-5-mini``.

    Returns:
        Process exit code 0 on success.
    """
    parser = argparse.ArgumentParser(description="Score router canary A/B response CSVs with OpenAI.")
    parser.add_argument("--in-csv", required=True, help="CSV containing captured answers.")
    parser.add_argument("--out-csv", default="", help="Scored CSV output path. Defaults next to the input.")
    parser.add_argument("--out-json", default="", help="Summary JSON output path. Defaults next to the input.")
    parser.add_argument(
        "--model",
        default=os.getenv("ROUTER_CANARY_AB_EVAL_MODEL", os.getenv("UNIFIED_KB_EVAL_SEMANTIC_MODEL", os.getenv("OPENAI_MODEL", "gpt-5-mini"))),
        help="OpenAI model used for grading.",
    )
    parser.add_argument("--pass-threshold", type=float, default=80.0)
    parser.add_argument("--safety-threshold", type=float, default=70.0)
    args = parser.parse_args()

    in_csv = Path(args.in_csv).resolve()
    out_csv = Path(args.out_csv).resolve() if args.out_csv else in_csv.with_name(f"{in_csv.stem}_scored.csv")
    out_json = Path(args.out_json).resolve() if args.out_json else in_csv.with_name(f"{in_csv.stem}_scored_summary.json")

    summary = score_csv(
        in_csv=in_csv,
        out_csv=out_csv,
        out_json=out_json,
        model=str(args.model or "gpt-5-mini"),
        # argparse already yields floats (type=float with float defaults).
        # Pass them through untouched: an `or <default>` fallback would treat
        # an explicit 0 as "unset" and silently restore the default threshold.
        pass_threshold=args.pass_threshold,
        safety_threshold=args.safety_threshold,
    )
    print(
        f"Scored {summary['scored_count']} rows from {summary['input_csv']} "
        f"with pass rate {summary['pass_rate']}% using {summary['model']}"
    )
    return 0
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
if __name__ == "__main__":
|
| 363 |
+
raise SystemExit(main())
|
backend/scripts/validate_hosted_runtime.py
CHANGED
|
@@ -97,6 +97,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
|
|
| 97 |
health_ok: Optional[bool] = None
|
| 98 |
health_status_code: Optional[int] = None
|
| 99 |
health_access = "ok"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
try:
|
| 101 |
health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
|
| 102 |
health_ok = bool(health.get("ok", False))
|
|
@@ -111,6 +116,30 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
|
|
| 111 |
raise
|
| 112 |
auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
build_version = str(build_info.get("build_version") or "").strip()
|
| 115 |
git_sha = str(build_info.get("git_sha") or "").strip()
|
| 116 |
startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
|
|
@@ -144,6 +173,8 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
|
|
| 144 |
failures.append(
|
| 145 |
f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
|
| 146 |
)
|
|
|
|
|
|
|
| 147 |
if audience in FORBIDDEN_AUDIENCE_VALUES:
|
| 148 |
failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
|
| 149 |
if expected_origin:
|
|
@@ -183,6 +214,11 @@ def _build_report(args: argparse.Namespace) -> Dict[str, Any]:
|
|
| 183 |
"health_ok": health_ok,
|
| 184 |
"health_access": health_access,
|
| 185 |
"health_status_code": health_status_code,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
"auth_required": auth_required,
|
| 187 |
"auth_enabled": auth_enabled,
|
| 188 |
"auth_audience": audience,
|
|
@@ -204,12 +240,18 @@ def main() -> int:
|
|
| 204 |
parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
|
| 205 |
parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
|
| 206 |
parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
|
| 208 |
parser.add_argument("--out", default="", help="Optional output JSON path.")
|
| 209 |
args = parser.parse_args()
|
| 210 |
|
| 211 |
args.expect_auth_required = _parse_bool(args.expect_auth_required)
|
| 212 |
args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
|
|
|
|
| 213 |
|
| 214 |
try:
|
| 215 |
report = _build_report(args)
|
|
|
|
| 97 |
health_ok: Optional[bool] = None
|
| 98 |
health_status_code: Optional[int] = None
|
| 99 |
health_access = "ok"
|
| 100 |
+
router_catalog_status: Dict[str, Any] = {}
|
| 101 |
+
router_catalog_loaded: Optional[bool] = None
|
| 102 |
+
router_catalog_product_count: Optional[int] = None
|
| 103 |
+
router_catalog_access = "not_checked"
|
| 104 |
+
router_catalog_status_code: Optional[int] = None
|
| 105 |
try:
|
| 106 |
health = _fetch_json(args.base_url, "/api/health", args.timeout_s)
|
| 107 |
health_ok = bool(health.get("ok", False))
|
|
|
|
| 116 |
raise
|
| 117 |
auth = health.get("auth") if isinstance(health.get("auth"), dict) else {}
|
| 118 |
|
| 119 |
+
if args.require_router_workbook_loaded:
|
| 120 |
+
router_catalog_access = "ok"
|
| 121 |
+
try:
|
| 122 |
+
router_catalog_status = _fetch_json(args.base_url, "/api/rapid_router/catalog/status", args.timeout_s)
|
| 123 |
+
catalog = router_catalog_status.get("catalog") if isinstance(router_catalog_status.get("catalog"), dict) else {}
|
| 124 |
+
router_catalog_loaded = bool(catalog.get("loaded", False))
|
| 125 |
+
try:
|
| 126 |
+
router_catalog_product_count = int(catalog.get("product_count")) if catalog.get("product_count") is not None else None
|
| 127 |
+
except Exception:
|
| 128 |
+
router_catalog_product_count = None
|
| 129 |
+
except HTTPError as exc:
|
| 130 |
+
router_catalog_status_code = int(exc.code)
|
| 131 |
+
router_catalog_access = "protected" if router_catalog_status_code in {401, 403} else "error"
|
| 132 |
+
failures.append(
|
| 133 |
+
"Hosted /api/rapid_router/catalog/status could not be validated"
|
| 134 |
+
f" (HTTP {router_catalog_status_code})."
|
| 135 |
+
)
|
| 136 |
+
except (URLError, TimeoutError, json.JSONDecodeError) as exc:
|
| 137 |
+
router_catalog_access = "error"
|
| 138 |
+
failures.append(
|
| 139 |
+
"Hosted /api/rapid_router/catalog/status could not be validated"
|
| 140 |
+
f" ({type(exc).__name__}: {exc})."
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
build_version = str(build_info.get("build_version") or "").strip()
|
| 144 |
git_sha = str(build_info.get("git_sha") or "").strip()
|
| 145 |
startup_integrity_ok = bool(build_info.get("startup_integrity_ok", False))
|
|
|
|
| 173 |
failures.append(
|
| 174 |
f"Hosted auth_enabled mismatch: expected {args.expect_auth_enabled}, got {auth_enabled}."
|
| 175 |
)
|
| 176 |
+
if args.require_router_workbook_loaded and router_catalog_loaded is False:
|
| 177 |
+
failures.append("Hosted router workbook catalog is not loaded.")
|
| 178 |
if audience in FORBIDDEN_AUDIENCE_VALUES:
|
| 179 |
failures.append(f"Hosted auth audience still resolves to removed placeholder '{audience}'.")
|
| 180 |
if expected_origin:
|
|
|
|
| 214 |
"health_ok": health_ok,
|
| 215 |
"health_access": health_access,
|
| 216 |
"health_status_code": health_status_code,
|
| 217 |
+
"router_catalog_loaded": router_catalog_loaded,
|
| 218 |
+
"router_catalog_product_count": router_catalog_product_count,
|
| 219 |
+
"router_catalog_access": router_catalog_access,
|
| 220 |
+
"router_catalog_status_code": router_catalog_status_code,
|
| 221 |
+
"router_catalog_status": router_catalog_status,
|
| 222 |
"auth_required": auth_required,
|
| 223 |
"auth_enabled": auth_enabled,
|
| 224 |
"auth_audience": audience,
|
|
|
|
| 240 |
parser.add_argument("--expected-git-sha", default="", help="Expected hosted git_sha.")
|
| 241 |
parser.add_argument("--expect-auth-required", default="true", help="Expected hosted auth_required value.")
|
| 242 |
parser.add_argument("--expect-auth-enabled", default="true", help="Expected hosted auth_enabled value.")
|
| 243 |
+
parser.add_argument(
|
| 244 |
+
"--require-router-workbook-loaded",
|
| 245 |
+
default="false",
|
| 246 |
+
help="Whether /api/rapid_router/catalog/status must report catalog.loaded=true.",
|
| 247 |
+
)
|
| 248 |
parser.add_argument("--timeout-s", type=float, default=20.0, help="HTTP timeout per request.")
|
| 249 |
parser.add_argument("--out", default="", help="Optional output JSON path.")
|
| 250 |
args = parser.parse_args()
|
| 251 |
|
| 252 |
args.expect_auth_required = _parse_bool(args.expect_auth_required)
|
| 253 |
args.expect_auth_enabled = _parse_bool(args.expect_auth_enabled)
|
| 254 |
+
args.require_router_workbook_loaded = _parse_bool(args.require_router_workbook_loaded)
|
| 255 |
|
| 256 |
try:
|
| 257 |
report = _build_report(args)
|
frontend/scripts/run-hosted-smoke.sh
CHANGED
|
@@ -4,6 +4,8 @@ set -euo pipefail
|
|
| 4 |
|
| 5 |
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 6 |
FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
|
|
|
|
|
|
| 7 |
ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
|
| 8 |
PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
|
| 9 |
CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
|
|
@@ -12,6 +14,7 @@ PROD_POTS_WORKSPACE_EXPECTATION="${E2E_PROD_POTS_WORKSPACE_EXPECTATION:-project-
|
|
| 12 |
CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
|
| 13 |
CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
|
| 14 |
TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
|
|
|
|
| 15 |
|
| 16 |
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 17 |
echo "Missing env file: ${ENV_FILE}" >&2
|
|
@@ -27,6 +30,12 @@ run_target() {
|
|
| 27 |
local pots_workspace_expectation="$3"
|
| 28 |
|
| 29 |
echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
|
| 31 |
e2e/auth.full-flow.spec.ts \
|
| 32 |
e2e/pots.provider-coverage.spec.ts \
|
|
|
|
| 4 |
|
| 5 |
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 6 |
FRONTEND_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
| 7 |
+
REPO_ROOT="$(cd "${FRONTEND_DIR}/.." && pwd)"
|
| 8 |
+
VALIDATOR="${REPO_ROOT}/backend/scripts/validate_hosted_runtime.py"
|
| 9 |
ENV_FILE="${E2E_ENV_FILE:-${FRONTEND_DIR}/.env.e2e}"
|
| 10 |
PROD_BASE_URL="${E2E_PROD_BASE_URL:-https://crazycrazypete-masters-four-tab-openai.hf.space}"
|
| 11 |
CANARY_BASE_URL="${E2E_CANARY_BASE_URL:-https://crazycrazypete-masters-four-tab-openai-canary.hf.space}"
|
|
|
|
| 14 |
CANARY_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_POTS_WORKSPACE_EXPECTATION:-project-shell}"
|
| 15 |
CANARY_AB_POTS_WORKSPACE_EXPECTATION="${E2E_CANARY_AB_POTS_WORKSPACE_EXPECTATION:-project-shell}"
|
| 16 |
TARGETS="${E2E_SMOKE_TARGETS:-production canary canary-ab}"
|
| 17 |
+
VALIDATE_TIMEOUT_S="${E2E_HOSTED_VALIDATE_TIMEOUT_S:-30}"
|
| 18 |
|
| 19 |
if [[ ! -f "${ENV_FILE}" ]]; then
|
| 20 |
echo "Missing env file: ${ENV_FILE}" >&2
|
|
|
|
| 30 |
local pots_workspace_expectation="$3"
|
| 31 |
|
| 32 |
echo "==> Hosted smoke: ${label} (${base_url}) [pots-workspace=${pots_workspace_expectation}]"
|
| 33 |
+
python3 "${VALIDATOR}" \
|
| 34 |
+
--base-url "${base_url}" \
|
| 35 |
+
--expect-auth-required true \
|
| 36 |
+
--expect-auth-enabled true \
|
| 37 |
+
--require-router-workbook-loaded true \
|
| 38 |
+
--timeout-s "${VALIDATE_TIMEOUT_S}"
|
| 39 |
E2E_BASE_URL="${base_url}" E2E_POTS_WORKSPACE_EXPECTATION="${pots_workspace_expectation}" npx playwright test \
|
| 40 |
e2e/auth.full-flow.spec.ts \
|
| 41 |
e2e/pots.provider-coverage.spec.ts \
|