Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tools /info_source_map.py

rohitsar567

refactor: KI-050 — complete data/ → 40-data/ rename across all Python refs

52c6351 about 2 months ago

Raw

History Blame Contribute Delete

27.5 kB

	"""
	info_source_map.py — 100% link-integrity + claim-to-source two-part audit.

	Walks every claim with provenance triple {value, source_pdf_path | source_url,
	source_quote} across:

	1. 40-data/policy_facts/*.json (per-policy curated facts; ~102 files)
	2. 40-data/reviews/*.json (per-insurer claim metrics + aggregator URLs)
	3. 40-data/premiums/illustrative_premiums.json (premium samples with source_url)

	For every (policy_id / insurer_slug, field, value, source) triple it runs:

	PART 1 — URL or local path resolves
	* source_pdf_path → file exists on disk
	* source_url → in tools/browser_verified.json allowlist
	OR httpx HEAD returns 2xx/3xx

	PART 2 — Source content backs the claim
	* source_pdf_path → open the PDF with pdfplumber, search for
	source_quote (case-insensitive, whitespace-normalised,
	a window of at most NEEDLE_LEN = 60 chars used as needle)
	* source_url → fetch first 50KB and grep for the same needle

	Verdicts (per claim):
	✅ verified — Part 1 + Part 2 both pass
	⚠️ url-ok-quote-missing — Part 1 passes, Part 2 fails
	❌ url-broken — Part 1 fails (file missing / URL broken)
	⏳ no-source-data — Field has a value but no source at all
	⏳ no-claim — Value is empty, or source_quote is a "not extracted" sentinel

	Output:
	- eval/info_source_map.json (machine-readable; ~one row per claim)
	- 40-data/information_source_map.md (human-readable audit report)
	"""

	from __future__ import annotations

	import argparse
	import json
	import re
	import sys
	import time
	from collections import Counter, defaultdict
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from pathlib import Path
	from typing import Any

	import httpx
	import pdfplumber

	# Project root: two directory levels above this file.
	ROOT = Path(__file__).resolve().parent.parent

	# Inputs walked by the audit.
	POLICY_FACTS_DIR = ROOT / "40-data" / "policy_facts"
	REVIEWS_DIR = ROOT / "40-data" / "reviews"
	PREMIUMS_FILE = ROOT / "40-data" / "premiums" / "illustrative_premiums.json"
	BROWSER_VERIFIED = ROOT / "tools" / "browser_verified.json"

	# Outputs written by main().
	JSON_OUT = ROOT / "eval" / "info_source_map.json"
	MD_OUT = ROOT / "40-data" / "information_source_map.md"

	# Per-process memoisation so each PDF / URL is fetched at most once per run.
	PDF_TEXT_CACHE: dict[str, str] = {}
	URL_TEXT_CACHE: dict[str, str] = {}
	URL_STATUS_CACHE: dict[str, int | None] = {}

	NEEDLE_LEN = 60  # length of substring used as a search needle
	MIN_NEEDLE_LEN = 12  # minimum useful needle
	URL_FETCH_MAX = 50 * 1024  # cap (in characters/bytes) on fetched page content
	URL_TIMEOUT = 8.0  # timeout passed to httpx.Client
	HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
	        "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
	    ),
	    # "*/*" is the wildcard media range; a bare "/" is not a valid Accept value.
	    "Accept": "*/*",
	}

	# A small set of phrases that signal "no claim was made" — we treat these as
	# administrative notes, not real claims to verify. Matching is by substring
	# against the normalised quote, so the shorter phrases subsume the longer ones.
	SENTINEL_PHRASES = {
	    "not extracted",
	    "not found",
	    "not specified",
	    "not enumerated",
	    "not explicitly stated",
	    "not extracted in this curation pass",
	    "not extracted in this pass",
	    "insurer-level metric",
	    "presumed excluded",
	    "presumed",
	    "needs re-curation",
	}


	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	# Typographic characters that PDF/HTML extraction renders inconsistently,
	# mapped to their plain ASCII equivalents.
	_PUNCT_TABLE = str.maketrans({
	    "\u2013": "-",   # en dash
	    "\u2014": "-",   # em dash
	    "\u2018": "'",   # left single quotation mark
	    "\u2019": "'",   # right single quotation mark
	    "\u201c": '"',   # left double quotation mark
	    "\u201d": '"',   # right double quotation mark
	})


	def normalise(text: str | None) -> str:
	    """Return *text* in canonical form for substring matching.

	    The result is lower-cased, every run of whitespace is collapsed to a
	    single space, smart quotes and en/em dashes are replaced by ASCII
	    equivalents, and leading/trailing whitespace is stripped. ``None`` yields
	    the empty string.
	    """
	    if text is None:
	        return ""
	    # ``\s`` on a str pattern matches Unicode whitespace, so non-breaking
	    # spaces are already folded into a plain space here.
	    collapsed = re.sub(r"\s+", " ", text.lower())
	    return collapsed.translate(_PUNCT_TABLE).strip()


	def best_needle(quote: str) -> str:
	    """Choose the substring of *quote* that is searched for in a source.

	    The quote is normalised first. If it is at most ``NEEDLE_LEN`` characters
	    long it is used whole. Otherwise ``NEEDLE_LEN``-wide windows are examined
	    at 20-character steps and the first one containing both a digit and a
	    letter is returned, so the needle is anchored on something more specific
	    than common words. If no window qualifies, the leading ``NEEDLE_LEN``
	    characters are returned.
	    """
	    q = normalise(quote)
	    if len(q) <= NEEDLE_LEN:
	        return q
	    # +1 so the window starting at the last valid offset is also considered.
	    last_start = len(q) - NEEDLE_LEN
	    for start in range(0, last_start + 1, 20):
	        window = q[start:start + NEEDLE_LEN]
	        if any(c.isdigit() for c in window) and any(c.isalpha() for c in window):
	            return window
	    return q[:NEEDLE_LEN]


	def is_sentinel_quote(quote: str | None) -> bool:
	    """Tell whether *quote* is an administrative placeholder rather than evidence.

	    A quote counts as a sentinel when its normalised form contains any entry
	    of ``SENTINEL_PHRASES``. Missing or empty quotes are not sentinels.
	    """
	    if not quote:
	        return False
	    normalised = normalise(quote)
	    for phrase in SENTINEL_PHRASES:
	        if phrase in normalised:
	            return True
	    return False


	def load_pdf_text(rel_path: str) -> str | None:
	    """Return the normalised text of a PDF, memoised in ``PDF_TEXT_CACHE``.

	    Args:
	        rel_path: PDF location, resolved against ``ROOT``.

	    Returns:
	        The normalised text of all pages joined together; ``""`` if the file
	        exists but could not be parsed; ``None`` if the file does not exist.
	    """
	    cached = PDF_TEXT_CACHE.get(rel_path)
	    if cached is not None:
	        return cached
	    abs_path = (ROOT / rel_path).resolve()
	    if not abs_path.exists():
	        # Deliberately not cached: storing "" here would make a repeat call
	        # return "" instead of None, hiding the "file missing" signal.
	        return None
	    try:
	        with pdfplumber.open(abs_path) as pdf:
	            pages = [page.extract_text() or "" for page in pdf.pages]
	        full = normalise("\n".join(pages))
	    except Exception as exc:  # noqa: BLE001 — best effort: any parse failure yields ""
	        print(f" [pdf-error] {rel_path}: {exc}", file=sys.stderr)
	        full = ""
	    PDF_TEXT_CACHE[rel_path] = full
	    return full


	def fetch_url_head(url: str) -> int | None:
	    """Probe *url* and return its HTTP status code, memoised in ``URL_STATUS_CACHE``.

	    A HEAD request is tried first, following redirects. If the server answers
	    403, 405 or 501 (statuses commonly returned when HEAD is refused), a
	    ranged GET is issued instead and its status is returned.

	    Returns:
	        The final status code, or ``None`` if the request raised.
	    """
	    if url in URL_STATUS_CACHE:
	        return URL_STATUS_CACHE[url]
	    try:
	        with httpx.Client(follow_redirects=True, timeout=URL_TIMEOUT,
	                          headers=HEADERS) as client:
	            status = client.head(url).status_code
	            if status in (403, 405, 501):  # some servers refuse HEAD
	                # Stream so only the status line and headers are read; a server
	                # that ignores Range would otherwise send the whole body.
	                with client.stream(
	                    "GET", url, headers={**HEADERS, "Range": "bytes=0-1024"}
	                ) as probe:
	                    status = probe.status_code
	    except Exception:  # noqa: BLE001 — any failure is reported as "unreachable"
	        status = None
	    URL_STATUS_CACHE[url] = status
	    return status


	def fetch_url_text(url: str) -> str:
	    """Fetch the start of *url*'s body as normalised text, memoised per URL.

	    A ranged GET is issued and the decoded body is cut to ``URL_FETCH_MAX``
	    characters before normalisation. An HTTP status of 400 or above, or any
	    exception, produces (and caches) the empty string.
	    """
	    cached = URL_TEXT_CACHE.get(url)
	    if cached is not None:
	        return cached
	    try:
	        with httpx.Client(follow_redirects=True, timeout=URL_TIMEOUT,
	                          headers=HEADERS) as client:
	            request_headers = {**HEADERS, "Range": f"bytes=0-{URL_FETCH_MAX}"}
	            response = client.get(url, headers=request_headers)
	            if response.status_code >= 400:
	                body = ""
	            else:
	                body = normalise(response.text[:URL_FETCH_MAX])
	    except Exception:  # noqa: BLE001
	        body = ""
	    URL_TEXT_CACHE[url] = body
	    return body


	def quote_found_in(haystack: str, quote: str) -> bool:
	    """Decide whether *quote* can be located in *haystack*.

	    Matching gets progressively more forgiving:

	    1. the needle chosen by ``best_needle`` appears verbatim;
	    2. the first 30 characters of that needle appear;
	    3. any token of the needle that is at least 9 characters long and
	       contains a digit appears.

	    Empty inputs, and needles shorter than ``MIN_NEEDLE_LEN``, never match.
	    """
	    if not (haystack and quote):
	        return False
	    needle = best_needle(quote)
	    if len(needle) < MIN_NEEDLE_LEN:
	        return False

	    prefix = needle[:30]
	    if needle in haystack or (len(prefix) >= MIN_NEEDLE_LEN and prefix in haystack):
	        return True

	    # Digit/code anchor: the pattern needs one leading character plus at least
	    # eight more, i.e. tokens of nine characters or longer.
	    candidates = re.findall(r"[a-z0-9][a-z0-9\-./]{8,}", needle)
	    return any(
	        token in haystack
	        for token in candidates
	        if any(ch.isdigit() for ch in token)
	    )


	# ---------------------------------------------------------------------------
	# Allowlist
	# ---------------------------------------------------------------------------

	def load_allowlist() -> set[str]:
	    """Load the set of URLs recorded in ``BROWSER_VERIFIED``.

	    Returns:
	        The keys of the JSON object in that file, or, if the file holds a JSON
	        array instead, its string entries. An empty set if the file is absent.
	    """
	    if not BROWSER_VERIFIED.exists():
	        return set()
	    # Explicit UTF-8: JSON is UTF-8 by specification, while read_text() would
	    # otherwise use the platform's locale encoding.
	    data = json.loads(BROWSER_VERIFIED.read_text(encoding="utf-8"))
	    if isinstance(data, dict):
	        return set(data)
	    return {entry for entry in data if isinstance(entry, str)}


	# ---------------------------------------------------------------------------
	# Auditors
	# ---------------------------------------------------------------------------

	def _join_note(existing: str, extra: str) -> str:
	    """Append *extra* to an audit note, separating entries with ' | '."""
	    return f"{existing} | {extra}" if existing else extra


	def _audit_pdf_source(row: dict, source_pdf_path: str,
	                      source_quote: str | None) -> dict:
	    """Run Part 1 (file exists) and Part 2 (quote in text) for a PDF-backed claim."""
	    if not (ROOT / source_pdf_path).resolve().exists():
	        row["verdict"] = "❌ url-broken"
	        row["notes"] = f"PDF not found at {source_pdf_path}"
	        return row

	    row["part1_resolves"] = True
	    text = load_pdf_text(source_pdf_path) or ""
	    if source_quote and quote_found_in(text, source_quote):
	        row["part2_quote_found"] = True
	        row["verdict"] = "✅ verified"
	        return row

	    row["verdict"] = "⚠️ url-ok-quote-missing"
	    if source_quote:
	        row["notes"] = "PDF exists but source_quote not found in extracted text"
	    else:
	        # Distinguish "nothing to search for" from "searched and not found".
	        row["notes"] = "PDF exists but no source_quote provided"
	    return row


	def _audit_url_source(row: dict, source_url: str, source_quote: str | None,
	                      allowlist: set[str], check_url_live: bool) -> dict:
	    """Run Part 1 (URL resolves) and Part 2 (quote in body) for a URL-backed claim."""
	    # Part 1 — allowlist hit, live probe, or (allowlist-only mode) assumed ok.
	    if source_url in allowlist:
	        row["notes"] = "URL in browser_verified allowlist"
	    elif check_url_live:
	        status = fetch_url_head(source_url)
	        if status is None or not 200 <= status < 400:
	            row["verdict"] = "❌ url-broken"
	            row["notes"] = f"HEAD returned status={status}"
	            return row
	    else:
	        row["notes"] = "URL liveness skipped (--allowlist-only)"
	    row["part1_resolves"] = True

	    # Part 2 — every exit below except a successful match is "quote missing".
	    # Notes are appended rather than overwritten so the Part 1 outcome
	    # (allowlisted / liveness skipped) stays visible in the report.
	    if not source_quote:
	        row["verdict"] = "⚠️ url-ok-quote-missing"
	        if row["notes"]:
	            row["notes"] = _join_note(row["notes"], "no source_quote provided")
	        else:
	            row["notes"] = "URL reachable but no source_quote provided"
	        return row

	    if not check_url_live:
	        row["verdict"] = "⚠️ url-ok-quote-missing"
	        row["notes"] = _join_note(row["notes"],
	                                  "quote not verified (allowlist-only mode)")
	        return row

	    text = fetch_url_text(source_url)
	    if text and quote_found_in(text, source_quote):
	        row["part2_quote_found"] = True
	        row["verdict"] = "✅ verified"
	        return row

	    row["verdict"] = "⚠️ url-ok-quote-missing"
	    if row["notes"]:
	        row["notes"] = _join_note(row["notes"], "quote not found in fetched body")
	    else:
	        row["notes"] = "URL reachable but quote not found in fetched body"
	    return row


	def audit_provenance_triple(
	    *,
	    record_id: str,
	    field: str,
	    value: Any,
	    source_pdf_path: str | None,
	    source_url: str | None,
	    source_quote: str | None,
	    allowlist: set[str],
	    check_url_live: bool,
	) -> dict:
	    """Return the audit row for a single claim.

	    Args:
	        record_id: Identifier of the record the claim belongs to.
	        field: Name of the audited field.
	        value: The claimed value; ``None``, ``""``, ``[]`` and ``{}`` mean
	            "no claim".
	        source_pdf_path: PDF path relative to ``ROOT``; when set it takes
	            precedence and ``source_url`` is not consulted.
	        source_url: URL backing the claim, used when no PDF path is given.
	        source_quote: Text expected to appear in the source.
	        allowlist: URLs treated as resolving without a live request.
	        check_url_live: When false, no HTTP requests are made; URLs are
	            assumed to resolve and quotes are left unverified.

	    Returns:
	        A dict echoing the inputs plus ``part1_resolves``,
	        ``part2_quote_found``, ``verdict`` and ``notes``. ``verdict`` is one
	        of ``✅ verified``, ``⚠️ url-ok-quote-missing``, ``❌ url-broken``,
	        ``⏳ no-source-data`` or ``⏳ no-claim``.
	    """
	    row = {
	        "record_id": record_id,
	        "field": field,
	        "value": value,
	        "source_pdf_path": source_pdf_path,
	        "source_url": source_url,
	        "source_quote": source_quote,
	        "part1_resolves": False,
	        "part2_quote_found": False,
	        "verdict": "❌ url-broken",
	        "notes": "",
	    }

	    # If value is None/empty or the quote is a sentinel, no claim is being made.
	    if value is None or value == "" or value == [] or value == {}:
	        row["verdict"] = "⏳ no-claim"
	        row["notes"] = "value is null / empty — no claim to verify"
	        return row

	    if is_sentinel_quote(source_quote):
	        row["verdict"] = "⏳ no-claim"
	        row["notes"] = "source_quote is a 'not extracted' sentinel — administrative note, not a claim"
	        return row

	    # Need at least one source to audit.
	    if not source_pdf_path and not source_url:
	        row["verdict"] = "⏳ no-source-data"
	        row["notes"] = "value populated but no source_pdf_path or source_url provided"
	        return row

	    if source_pdf_path:
	        return _audit_pdf_source(row, source_pdf_path, source_quote)
	    return _audit_url_source(row, source_url, source_quote, allowlist, check_url_live)


	# ---------------------------------------------------------------------------
	# Walkers
	# ---------------------------------------------------------------------------

	def walk_policy_facts(allowlist: set[str], check_url_live: bool) -> list[dict]:
	    """Audit every provenance-bearing field of each policy-facts JSON file.

	    Files in ``POLICY_FACTS_DIR`` whose name starts with ``_`` are skipped.
	    Within a file, identity/metadata keys and any field whose value is not a
	    JSON object are ignored; each remaining field is passed to
	    ``audit_provenance_triple``.

	    Returns:
	        One audit row per audited field, tagged with ``category`` =
	        ``"policy_facts"`` and the ``source_file`` path relative to ``ROOT``.
	    """
	    non_claim_fields = ("policy_id", "policy_name", "insurer_slug", "_meta")
	    rows: list[dict] = []
	    files = sorted(POLICY_FACTS_DIR.glob("*.json"))
	    for i, f in enumerate(files, 1):
	        if f.name.startswith("_"):
	            continue
	        # Explicit UTF-8 so parsing does not depend on the platform locale.
	        data = json.loads(f.read_text(encoding="utf-8"))
	        policy_id = data.get("policy_id", f.stem)
	        print(f" [{i:>3}/{len(files)}] policy_facts: {policy_id}", file=sys.stderr)
	        source_file = str(f.relative_to(ROOT))
	        for field, obj in data.items():
	            if field in non_claim_fields or not isinstance(obj, dict):
	                continue
	            row = audit_provenance_triple(
	                record_id=policy_id,
	                field=field,
	                value=obj.get("value"),
	                source_pdf_path=obj.get("source_pdf_path"),
	                source_url=obj.get("source_url"),
	                source_quote=obj.get("source_quote"),
	                allowlist=allowlist,
	                check_url_live=check_url_live,
	            )
	            row["category"] = "policy_facts"
	            row["source_file"] = source_file
	            rows.append(row)
	    return rows


	def _audit_review_url(*, slug: str, source_file: str, field: str, value: Any,
	                      url: str, allowlist: set[str], check_url_live: bool,
	                      quote: str | None = None) -> dict:
	    """Audit one URL-backed claim from a reviews file and tag the resulting row."""
	    row = audit_provenance_triple(
	        record_id=slug,
	        field=field,
	        value=value,
	        source_pdf_path=None,
	        source_url=url,
	        source_quote=quote,
	        allowlist=allowlist,
	        check_url_live=check_url_live,
	    )
	    row["category"] = "reviews"
	    row["source_file"] = source_file
	    return row


	def walk_reviews(allowlist: set[str], check_url_live: bool) -> list[dict]:
	    """Audit every URL inside the per-insurer reviews JSON files.

	    For each file in ``REVIEWS_DIR`` (names starting with ``_`` are skipped)
	    the following URL-bearing sections are audited, in this order:
	    ``claim_metrics`` source URLs, ``aggregator_ratings``, ``trustpilot``,
	    ``reddit_sentiment.sample_post_urls``,
	    ``youtube_coverage.top_creators_who_reviewed`` and ``in_news``.

	    Returns:
	        One audit row per URL, tagged with ``category`` = ``"reviews"`` and the
	        ``source_file`` path relative to ``ROOT``.
	    """
	    rows: list[dict] = []
	    files = sorted(REVIEWS_DIR.glob("*.json"))
	    for i, f in enumerate(files, 1):
	        if f.name.startswith("_"):
	            continue
	        # Explicit UTF-8 so parsing does not depend on the platform locale.
	        data = json.loads(f.read_text(encoding="utf-8"))
	        slug = data.get("insurer_slug", f.stem)
	        print(f" [{i:>3}/{len(files)}] reviews: {slug}", file=sys.stderr)
	        shared = {
	            "slug": slug,
	            "source_file": str(f.relative_to(ROOT)),
	            "allowlist": allowlist,
	            "check_url_live": check_url_live,
	        }

	        # claim_metrics block — three URLs, all supporting the same ratio.
	        cm = data.get("claim_metrics", {}) or {}
	        csr = cm.get("claim_settlement_ratio_pct")
	        # NOTE(review): the quote is just the number rendered as text, and
	        # quote_found_in rejects needles shorter than MIN_NEEDLE_LEN, so these
	        # rows look unable to reach "✅ verified" — confirm that is intended.
	        csr_quote = f"{csr}" if csr is not None else None
	        for url_field in ("source_irdai_url", "source_secondary_url", "source_company_url"):
	            url = cm.get(url_field)
	            if not url:
	                continue
	            rows.append(_audit_review_url(
	                field=f"claim_metrics.{url_field}", value=csr, url=url,
	                quote=csr_quote, **shared))

	        # aggregator_ratings — policybazaar / insuredekho / joinditto each have url.
	        # No quote: rating pages rarely surface text-quotable evidence.
	        for agg_name, agg in (data.get("aggregator_ratings") or {}).items():
	            if not isinstance(agg, dict):
	                continue
	            url = agg.get("url")
	            if not url:
	                continue
	            rows.append(_audit_review_url(
	                field=f"aggregator_ratings.{agg_name}", value=agg.get("avg_star"),
	                url=url, **shared))

	        # trustpilot.url
	        tp = data.get("trustpilot") or {}
	        if tp.get("url"):
	            rows.append(_audit_review_url(
	                field="trustpilot.url", value=tp.get("score"),
	                url=tp.get("url"), **shared))

	        # reddit_sentiment.sample_post_urls — the URL is both claim and source.
	        rs = data.get("reddit_sentiment") or {}
	        for j, url in enumerate(rs.get("sample_post_urls") or []):
	            rows.append(_audit_review_url(
	                field=f"reddit.sample_post_urls[{j}]", value=url, url=url, **shared))

	        # youtube_coverage.top_creators_who_reviewed[].video_url
	        yc = data.get("youtube_coverage") or {}
	        for j, vid in enumerate(yc.get("top_creators_who_reviewed") or []):
	            url = vid.get("video_url")
	            if not url:
	                continue
	            rows.append(_audit_review_url(
	                field=f"youtube[{j}].{vid.get('creator', '?')}",
	                value=vid.get("video_title"), url=url, **shared))

	        # in_news[].url
	        for j, news in enumerate(data.get("in_news") or []):
	            url = news.get("url")
	            if not url:
	                continue
	            rows.append(_audit_review_url(
	                field=f"in_news[{j}]", value=news.get("headline"),
	                url=url, **shared))
	    return rows


	def walk_premiums(allowlist: set[str], check_url_live: bool) -> list[dict]:
	    """Audit the source URLs recorded in ``PREMIUMS_FILE``.

	    Two sections are walked: the top-level ``sources_consulted`` list, and the
	    ``samples`` of every entry under ``base_premiums``. Samples with no
	    ``source_url``, or whose ``source_url`` is ``"derived_from_anchor"``, are
	    skipped.

	    Returns:
	        One audit row per URL, tagged with ``category`` = ``"premiums"`` and
	        the ``source_file`` path relative to ``ROOT``. An empty list if the
	        premiums file does not exist.
	    """
	    rows: list[dict] = []
	    if not PREMIUMS_FILE.exists():
	        return rows
	    # Explicit UTF-8 so parsing does not depend on the platform locale.
	    data = json.loads(PREMIUMS_FILE.read_text(encoding="utf-8"))
	    print(f" premiums: {PREMIUMS_FILE.name}", file=sys.stderr)
	    source_file = str(PREMIUMS_FILE.relative_to(ROOT))

	    def audit_url(record_id: str, field: str, value: Any, url: str) -> None:
	        """Audit one quote-less URL claim and append the tagged row."""
	        row = audit_provenance_triple(
	            record_id=record_id,
	            field=field,
	            value=value,
	            source_pdf_path=None,
	            source_url=url,
	            source_quote=None,
	            allowlist=allowlist,
	            check_url_live=check_url_live,
	        )
	        row["category"] = "premiums"
	        row["source_file"] = source_file
	        rows.append(row)

	    # sources_consulted at the top level — the URL is both claim and source.
	    for j, url in enumerate(data.get("sources_consulted") or []):
	        audit_url("premiums_meta", f"sources_consulted[{j}]", url, url)

	    # per-policy base_premiums
	    for policy_id, blk in (data.get("base_premiums") or {}).items():
	        for j, sample in enumerate(blk.get("samples") or []):
	            url = sample.get("source_url")
	            if not url or url == "derived_from_anchor":
	                # derived samples are explicitly labelled — not a claim
	                # against an external source
	                continue
	            audit_url(
	                policy_id,
	                f"samples[{j}].age={sample.get('age')}_si={sample.get('sum_insured_inr')}",
	                sample.get("annual_premium_inr"),
	                url,
	            )
	    return rows


	# ---------------------------------------------------------------------------
	# Report
	# ---------------------------------------------------------------------------

	def _md_cell(value: object) -> str:
	    """Return *value* as text that is safe inside one Markdown table cell.

	    A raw ``|`` would open a new column and a line break would end the row,
	    so pipes are backslash-escaped and line breaks are replaced by spaces.
	    """
	    text = str(value).replace("|", "\\|")
	    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


	def render_markdown(rows: list[dict], summary_meta: dict) -> str:
	    """Render the human-readable audit report as a Markdown document.

	    Args:
	        rows: Audit rows; each needs ``record_id``, ``field``, ``value``,
	            ``source_pdf_path``, ``source_url``, ``verdict``, ``notes`` and
	            ``category``.
	        summary_meta: Run metadata; only ``generated_at`` is read here.

	    Returns:
	        The report text: a per-category verdict summary, a "Must Fix" table of
	        broken sources, per-category tables of flagged claims (capped at 200
	        rows each), the records whose real claims are all verified, the
	        records that still have flagged claims, and a one-line total.
	    """
	    verified_v = "✅ verified"
	    missing_v = "⚠️ url-ok-quote-missing"
	    broken_v = "❌ url-broken"
	    pending_vs = ("⏳ no-claim", "⏳ no-source-data")

	    counts_overall = Counter(r["verdict"] for r in rows)
	    counts_by_cat: dict[str, Counter] = defaultdict(Counter)
	    for r in rows:
	        counts_by_cat[r["category"]][r["verdict"]] += 1

	    def summary_row(label: str, c: Counter) -> str:
	        pending = sum(c.get(v, 0) for v in pending_vs)
	        return (
	            f"| {label} | {c.get(verified_v, 0)} | "
	            f"{c.get(missing_v, 0)} | "
	            f"{c.get(broken_v, 0)} | "
	            f"{pending} |"
	        )

	    def source_of(r: dict) -> str:
	        return _md_cell(r["source_pdf_path"] or r["source_url"] or "—")

	    lines = [
	        "# Insurance Sales Bot — Information Source Map",
	        "",
	        f"Generated: {summary_meta['generated_at']}",
	        f"Total claims audited: {len(rows)}",
	        "",
	        "## Verdict Summary",
	        "",
	        "| Category | ✅ verified | ⚠️ url-ok-quote-missing | ❌ url-broken | ⏳ no-claim / no-source |",
	        "|---|---:|---:|---:|---:|",
	    ]
	    cats = sorted(counts_by_cat.keys())
	    for cat in cats:
	        lines.append(summary_row(_md_cell(cat), counts_by_cat[cat]))
	    lines.append(summary_row("TOTAL", counts_overall))
	    lines.append("")

	    # Must-Fix section
	    broken_rows = [r for r in rows if r["verdict"] == broken_v]
	    lines.append(f"## Must Fix — {len(broken_rows)} broken source(s)")
	    lines.append("")
	    if not broken_rows:
	        lines.append("_None — all sources resolve._")
	    else:
	        lines.append("| Record | Field | Value | Source | Notes |")
	        lines.append("|---|---|---|---|---|")
	        for r in broken_rows:
	            val = _md_cell(str(r["value"])[:60])
	            lines.append(
	                f"| `{_md_cell(r['record_id'])}` | `{_md_cell(r['field'])}` | {val} | "
	                f"`{source_of(r)}` | {_md_cell(r['notes'])} |"
	            )
	    lines.append("")

	    # Per-category tables (compressed: only ⚠️ + ❌ shown)
	    for cat in cats:
	        lines.append(f"## {cat}")
	        lines.append("")
	        cat_rows = [r for r in rows if r["category"] == cat]
	        cat_counts = counts_by_cat[cat]
	        flagged = [r for r in cat_rows if r["verdict"] in (missing_v, broken_v)]
	        lines.append(f"Audited {len(cat_rows)} claims — ✅ {cat_counts.get(verified_v, 0)} verified, "
	                     f"⚠️ {cat_counts.get(missing_v, 0)} "
	                     f"quote-missing, ❌ {cat_counts.get(broken_v, 0)} broken.")
	        lines.append("")
	        if flagged:
	            lines.append("### Flagged claims")
	            lines.append("")
	            lines.append("| Record | Field | Verdict | Source | Notes |")
	            lines.append("|---|---|---|---|---|")
	            for r in flagged[:200]:  # cap to keep MD manageable
	                lines.append(
	                    f"| `{_md_cell(r['record_id'])}` | `{_md_cell(r['field'])}` | {r['verdict']} | "
	                    f"`{source_of(r)}` | {_md_cell(r['notes'])} |"
	                )
	            if len(flagged) > 200:
	                lines.append(f"\n_... and {len(flagged) - 200} more rows truncated; see eval/info_source_map.json for full data._")
	            lines.append("")

	    # 100% verified insurers
	    lines.append("## Insurers / Policies with 100% verified claims")
	    lines.append("")
	    per_record_counts: dict[str, Counter] = defaultdict(Counter)
	    for r in rows:
	        per_record_counts[r["record_id"]][r["verdict"]] += 1
	    clean = []
	    not_clean = []
	    for record_id, c in sorted(per_record_counts.items()):
	        n_verified = c.get(verified_v, 0)
	        n_broken = c.get(broken_v, 0)
	        n_missing = c.get(missing_v, 0)
	        if n_verified + n_broken + n_missing == 0:
	            continue  # only no-claim rows
	        if n_broken == 0 and n_missing == 0:
	            clean.append(record_id)
	        else:
	            not_clean.append((record_id, n_verified, n_missing, n_broken))
	    lines.extend(f"- {record_id}" for record_id in clean)
	    if not clean:
	        lines.append("_None._")
	    lines.append("")

	    lines.append("## Records with remaining ⚠️ url-ok-quote-missing")
	    lines.append("")
	    if not_clean:
	        lines.append("| Record | ✅ | ⚠️ | ❌ |")
	        lines.append("|---|---:|---:|---:|")
	        for record_id, v, q, b in not_clean:
	            lines.append(f"| {_md_cell(record_id)} | {v} | {q} | {b} |")
	    else:
	        lines.append("_None._")
	    lines.append("")

	    # Final summary line
	    lines.append("---")
	    lines.append("")
	    lines.append(f"**Audit complete: ✅ {counts_overall.get(verified_v, 0)} / "
	                 f"⚠️ {counts_overall.get(missing_v, 0)} / "
	                 f"❌ {counts_overall.get(broken_v, 0)}**")
	    return "\n".join(lines)


	# ---------------------------------------------------------------------------
	# Entry point
	# ---------------------------------------------------------------------------

	def main(argv=None):
	    """Run the full audit and write the JSON and Markdown reports.

	    Args:
	        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

	    Side effects:
	        Writes ``JSON_OUT`` and ``MD_OUT`` (creating parent directories),
	        prints progress to stderr and a summary to stdout.
	    """
	    p = argparse.ArgumentParser(
	        description=__doc__,
	        # Keep the module docstring's line breaks in --help output.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    p.add_argument("--allowlist-only", action="store_true",
	                   help="Skip live HTTP for URLs (rely only on browser_verified.json)")
	    # NOTE(review): this flag only skips the reviews and premiums walkers;
	    # URL-backed claims inside policy_facts are still audited. Confirm whether
	    # the help text or the behaviour should change.
	    p.add_argument("--skip-urls", action="store_true",
	                   help="Skip URL audits entirely; only audit PDF-backed claims")
	    # NOTE(review): --quiet is accepted but not read anywhere in this function.
	    p.add_argument("--quiet", action="store_true")
	    args = p.parse_args(argv)

	    allowlist = load_allowlist()
	    print(f"Loaded {len(allowlist)} URLs in browser_verified allowlist.", file=sys.stderr)
	    check_url_live = not args.allowlist_only

	    t0 = time.time()
	    rows: list[dict] = []
	    rows.extend(walk_policy_facts(allowlist, check_url_live))
	    if not args.skip_urls:
	        rows.extend(walk_reviews(allowlist, check_url_live))
	        rows.extend(walk_premiums(allowlist, check_url_live))
	    elapsed = time.time() - t0

	    summary_meta = {
	        "generated_at": time.strftime("%Y-%m-%d %H:%M:%S %Z"),
	        "elapsed_sec": round(elapsed, 1),
	        "rows": len(rows),
	    }

	    # Both reports contain non-ASCII text (verdict emoji, ensure_ascii=False),
	    # so write them as UTF-8 explicitly instead of the locale encoding.
	    JSON_OUT.parent.mkdir(parents=True, exist_ok=True)
	    JSON_OUT.write_text(
	        json.dumps({"meta": summary_meta, "rows": rows}, indent=2, ensure_ascii=False),
	        encoding="utf-8",
	    )
	    md = render_markdown(rows, summary_meta)
	    MD_OUT.parent.mkdir(parents=True, exist_ok=True)
	    MD_OUT.write_text(md, encoding="utf-8")

	    print(f"\nWrote {JSON_OUT.relative_to(ROOT)} ({len(rows)} rows)")
	    print(f"Wrote {MD_OUT.relative_to(ROOT)}")
	    print(f"Elapsed: {elapsed:.1f}s")

	    # one-line verdict
	    counts = Counter(r["verdict"] for r in rows)
	    print(f"\nVerdicts: ✅ {counts.get('✅ verified',0)} | "
	          f"⚠️ {counts.get('⚠️ url-ok-quote-missing',0)} | "
	          f"❌ {counts.get('❌ url-broken',0)} | "
	          f"⏳ {counts.get('⏳ no-claim',0) + counts.get('⏳ no-source-data',0)}")


	if __name__ == "__main__":
	    main()