import re
import json
import logging
from typing import Optional, Dict
from urllib.parse import urlparse

# Module-level logger shared by the helpers in this file. The logger name is
# a fixed string (not __name__), so it stays the same wherever the module is
# imported from.
logger = logging.getLogger("fact_checker_utils")

def sanitize_text(text: Optional[str]) -> str:
    """Return *text* with HTML-like tags removed and whitespace collapsed.

    Anything of the form ``<...>`` is replaced by a single space, then every
    run of whitespace is squeezed to one space and the result is trimmed.
    ``None`` and the empty string both yield ``""``.
    """
    if text:
        # Tags become spaces (not nothing) so adjacent words do not fuse.
        without_tags = re.sub(r"<[^>]+>", " ", text)
        return re.sub(r"\s+", " ", without_tags).strip()
    return ""

def extract_json_from_text(text: str) -> Optional[str]:
    """
    Extract a JSON object substring from free-form text.

    Two strategies are tried in order:

    1. A fenced code block (```json ... ``` or ``` ... ```) whose content is
       a ``{...}`` object; the object text inside the fence is returned.
    2. Otherwise, the first brace-balanced ``{...}`` span found in the text.
       Braces that appear inside double-quoted JSON strings (including after
       backslash escapes) are ignored while balancing, so a value such as
       ``"}"`` does not end the object early.

    The returned string is not validated as JSON; callers must parse it.

    Returns:
        The extracted object text, or ``None`` if *text* is empty or no
        balanced object is found.
    """
    if not text:
        return None

    # 1. Fenced block. The lazy ``.*?`` expands until a ``}`` is directly
    #    followed (modulo whitespace) by the closing fence.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.S | re.I)
    if fenced:
        return fenced.group(1)

    # 2. Unfenced object: string-aware brace counting.
    start = None
    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(text):
        if start is None:
            # Outside any object: ignore everything (stray "}" and quotes in
            # surrounding prose included) until the first opening brace.
            if ch == "{":
                start = i
                depth = 1
            continue

        if in_string:
            if escaped:
                # Character following a backslash is consumed literally.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None

def safe_parse_gemini_json(raw_text: str) -> Optional[dict]:
    """
    Locate a JSON object in a model response and decode it.

    The candidate substring comes from extract_json_from_text. Returns the
    decoded value, or ``None`` when no candidate is found or decoding fails
    (a warning with the first 200 characters of the candidate is logged in
    the latter case).
    """
    candidate = extract_json_from_text(raw_text)
    if candidate:
        try:
            parsed = json.loads(candidate)
        except Exception:
            # Best-effort helper: any decoding failure is reported, not raised.
            logger.warning("Failed to parse extracted JSON: %s", candidate[:200])
            parsed = None
        return parsed
    return None

def domain_from_url(url: str) -> str:
    """Return the lower-cased hostname of *url* without a leading ``www.``.

    Only the literal ``www.`` prefix is removed; other subdomains are kept
    (``web.archive.org`` stays ``web.archive.org``). No public-suffix logic
    is applied, so the result is the host, not a registrable domain.

    Returns:
        The normalised hostname, or ``""`` when the URL has no hostname or
        cannot be parsed.
    """
    try:
        host = (urlparse(url).hostname or "").lower()
        # str.lstrip("www.") would strip any leading run of the characters
        # "w" and ".", mangling hosts such as "web.example.com"; remove the
        # exact prefix instead.
        if host.startswith("www."):
            host = host[len("www."):]
        return host
    except Exception:
        # Deliberately broad: this is a best-effort helper and malformed or
        # non-string input must yield "" rather than raise.
        return ""