news / _utils.py
mdAmin313's picture
Update _utils.py
5d9234c verified
import re
import json
import logging
from typing import Optional, Dict
from urllib.parse import urlparse
# Module-level logger shared by the helpers in this file. The logger name is
# the fixed string "fact_checker_utils" rather than being derived from __name__.
logger = logging.getLogger("fact_checker_utils")
def sanitize_text(text: Optional[str]) -> str:
    """Strip HTML-like tags from *text* and collapse runs of whitespace.

    Every ``<...>`` span is replaced by a single space, then each run of
    whitespace is squeezed to one space and the result is trimmed at both
    ends. ``None`` and the empty string both yield ``""``.
    """
    if text:
        # Replace tags with a space (not "") so adjacent words stay separated.
        without_tags = re.sub(r"<[^>]+>", " ", text)
        collapsed = re.sub(r"\s+", " ", without_tags)
        return collapsed.strip()
    return ""
def extract_json_from_text(text: str) -> Optional[str]:
    """Return the first JSON-object-looking substring found in *text*.

    Two strategies are tried in order:

    1. A fenced code block (```json ... ``` or ``` ... ```) whose content
       starts with ``{`` and ends with ``}``; the content between the
       fences is returned.
    2. Otherwise, the first brace-balanced ``{...}`` span in the text.
       Braces that appear inside double-quoted string literals (including
       strings containing escaped quotes) are not counted, so values such
       as ``"}"`` do not end the object early.

    The returned substring is not validated as JSON; callers must still
    parse it.

    Args:
        text: Arbitrary text that may embed a JSON object.

    Returns:
        The extracted substring, or ``None`` if *text* is empty or no
        balanced object is found.
    """
    if not text:
        return None

    # 1. Fenced block: capture the {...} payload between the backticks.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S | re.I)
    if fenced:
        return fenced.group(1)

    # 2. Unfenced object: count braces, skipping over string literals.
    start = None
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if in_string:
            if escaped:
                # Character after a backslash is consumed verbatim.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            # Only track strings once inside an object; quotes in the
            # surrounding prose must not hide the opening brace.
            if depth > 0:
                in_string = True
        elif ch == "{":
            if start is None:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None
def safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
    """Parse the JSON object embedded in a Gemini text response.

    The candidate substring is located with ``extract_json_from_text`` and
    decoded with ``json.loads``.

    Returns:
        The decoded value, or ``None`` when no candidate is found or when
        decoding fails. On a decoding failure a warning containing the
        first 200 characters of the candidate is logged.
    """
    candidate = extract_json_from_text(raw_text)
    if candidate:
        try:
            return json.loads(candidate)
        except Exception:
            # Best-effort: a malformed payload is reported, never raised.
            logger.warning("Failed to parse extracted JSON: %s", candidate[:200])
    return None
def domain_from_url(url: str) -> str:
    """Return the lowercased hostname of *url* without a leading ``www.``.

    Only the exact ``www.`` prefix is removed; other subdomains are kept,
    so this is the hostname rather than the registrable base domain.

    Args:
        url: The URL to inspect.

    Returns:
        The hostname in lowercase with one leading ``www.`` removed, or
        ``""`` when the URL has no hostname or cannot be parsed.
    """
    try:
        host = (urlparse(url).hostname or "").lower()
        # Remove the literal prefix only. str.lstrip("www.") treats its
        # argument as a character set and would also eat the start of
        # hosts such as "web.example.com" or "w.example.com".
        if host.startswith("www."):
            host = host[len("www."):]
        return host
    except Exception:
        # Deliberately best-effort: malformed or non-string input yields "".
        return ""