Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| import logging | |
| from typing import Optional, Dict | |
| from urllib.parse import urlparse | |
| logger = logging.getLogger("fact_checker_utils") | |
def sanitize_text(text: Optional[str]) -> str:
    """Return *text* with HTML tags stripped and whitespace collapsed.

    Each ``<...>`` tag is replaced by a single space, then every run of
    whitespace is squeezed into one space and the ends are trimmed.
    ``None`` and the empty string both yield ``""``.
    """
    if text:
        # Swap tags for a space first so adjacent words do not fuse together.
        without_tags = re.sub(r"<[^>]+>", " ", text)
        return re.sub(r"\s+", " ", without_tags).strip()
    return ""
def extract_json_from_text(text: str) -> Optional[str]:
    """Extract the first JSON object embedded in *text*.

    Two strategies are tried in order:

    1. A fenced block (```json ... ``` or ``` ... ```) whose content is a
       brace-delimited object; the object text inside the fence is returned.
    2. A left-to-right scan for the first balanced ``{...}`` span. Braces
       that occur inside double-quoted JSON strings (including strings with
       escaped quotes) are ignored, so a value such as ``"}"`` does not end
       the object early.

    The result is not validated as JSON; callers are expected to parse it.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The extracted object as a string, or ``None`` when *text* is empty
        or contains no balanced object.
    """
    if not text:
        return None

    # 1. Fenced block: capture the braces-delimited content between backticks.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S | re.I)
    if fenced:
        return fenced.group(1)

    # 2. Fallback: brace counting that is aware of JSON string literals.
    start = None
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if in_string:
            if escaped:
                # The character after a backslash never closes the string.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            # Quotes in the prose before the object are not JSON strings,
            # so string tracking only starts once an object is open.
            if depth > 0:
                in_string = True
        elif ch == "{":
            if start is None:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None
def safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
    """Parse the JSON object embedded in a Gemini text response.

    The candidate JSON string is located with ``extract_json_from_text``
    and decoded with ``json.loads``.

    Args:
        raw_text: The raw text of the model response.

    Returns:
        The decoded value, or ``None`` when no JSON candidate is found or
        decoding fails (a warning with the first 200 characters of the
        candidate is logged in the latter case).
    """
    candidate = extract_json_from_text(raw_text)
    if candidate:
        try:
            parsed = json.loads(candidate)
        except Exception:
            # Best-effort: a malformed model response must not propagate.
            logger.warning("Failed to parse extracted JSON: %s", candidate[:200])
            parsed = None
        return parsed
    return None
def domain_from_url(url: str) -> str:
    """Return the lower-cased hostname of *url* without a leading ``www.``.

    Only a literal ``www.`` prefix is removed; other subdomains are kept
    as-is (``news.example.com`` stays ``news.example.com``).

    Args:
        url: The URL to inspect.

    Returns:
        The normalized hostname, or ``""`` when *url* has no hostname or
        cannot be parsed.
    """
    try:
        host = (urlparse(url).hostname or "").lower()
        # str.lstrip("www.") strips any leading run of the characters 'w'
        # and '.', which mangles hosts such as "web.example.com" and
        # "www.wikipedia.org"; remove the exact prefix instead.
        if host.startswith("www."):
            host = host[len("www."):]
        return host
    except (ValueError, TypeError, AttributeError):
        # Malformed URLs (e.g. bad IPv6 literal) or non-string input.
        return ""