import re
import json
import logging
from typing import Optional, Dict
from urllib.parse import urlparse

logger = logging.getLogger("fact_checker_utils")

# Patterns are compiled once at import time; they are reused on every call.
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")
_FENCED_JSON_RE = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.S | re.I)

_WWW_PREFIX = "www."


def sanitize_text(text: Optional[str]) -> str:
    """Strip HTML-like tags from *text* and collapse whitespace.

    Args:
        text: Raw text, possibly containing ``<...>`` tags. ``None`` and the
            empty string are accepted.

    Returns:
        The text with every ``<...>`` span replaced by a space, runs of
        whitespace collapsed to a single space, and both ends trimmed.
        Returns ``""`` for falsy input.
    """
    if not text:
        return ""
    # Replace tags with a space (not "") so words separated only by a tag
    # do not get glued together.
    without_tags = _HTML_TAG_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", without_tags).strip()


def _find_balanced_object(text: str) -> Optional[str]:
    """Return the first brace-balanced ``{...}`` span in *text*, or ``None``.

    Braces that appear inside double-quoted strings (with backslash escapes
    honoured) are ignored once scanning is inside an object, so a value such
    as ``"}"`` does not terminate the object early.
    """
    start = None
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if in_string:
            if escaped:
                # Character following a backslash is taken literally.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            # Quotes in the prose before the object are not JSON strings;
            # only track strings once an object has been opened.
            if depth > 0:
                in_string = True
        elif ch == "{":
            if start is None:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None


def extract_json_from_text(text: str) -> Optional[str]:
    """Extract a JSON object substring from free-form text.

    A fenced block (```` ```json {...} ``` ```` or ```` ``` {...} ``` ````) is
    preferred. If none is found, the first brace-balanced ``{...}`` span in
    the text is returned.

    Args:
        text: Text that may contain a JSON object.

    Returns:
        The candidate JSON object as a string (not validated), or ``None``
        when *text* is empty or contains no complete ``{...}`` span.
    """
    if not text:
        return None

    # 1. Fenced block: capture the braces-delimited content between backticks.
    fenced = _FENCED_JSON_RE.search(text)
    if fenced:
        return fenced.group(1)

    # 2. Fallback: brace matching for an unfenced JSON object.
    return _find_balanced_object(text)


def safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
    """Extract and parse a JSON object from a model's text response.

    Args:
        raw_text: Response text to search, via ``extract_json_from_text``.

    Returns:
        The parsed object, or ``None`` when no JSON candidate is found or the
        candidate fails to parse (a warning with the first 200 characters of
        the candidate is logged in that case).
    """
    candidate = extract_json_from_text(raw_text)
    if not candidate:
        return None
    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically deep nesting. Either way: best-effort, return None.
        logger.warning("Failed to parse extracted JSON: %s", candidate[:200])
        return None


def domain_from_url(url: str) -> str:
    """Return the lowercase hostname of *url* without a leading ``www.``.

    Args:
        url: URL string to inspect.

    Returns:
        The hostname in lowercase with a single leading ``"www."`` removed,
        or ``""`` when the URL has no hostname or cannot be parsed.
    """
    try:
        host = (urlparse(url).hostname or "").lower()
        # Remove the literal prefix only. str.lstrip("www.") would strip any
        # leading run of the characters 'w' and '.', mangling hosts such as
        # "web.example.com".
        if host.startswith(_WWW_PREFIX):
            host = host[len(_WWW_PREFIX):]
        return host
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs (e.g. bad IPv6 literal) or non-string input.
        return ""