import os
import re
import json
import asyncio
from urllib.parse import urljoin, urlparse

import httpx
import gradio as gr
from bs4 import BeautifulSoup


# --- Scraper core helpers ---

def _is_valid_url(url: str) -> bool:
    try:
        u = urlparse(url.strip())
        return u.scheme in {"http", "https"} and bool(u.netloc)
    except Exception:
        return False


def _clean_text(s: str) -> str:
    s = re.sub(r"\s+", " ", s or "").strip()
    return s


def _extract_main_text(html: str) -> str:
    """
    Lightweight "main text" extraction (no heavy ML deps):
    - remove script/style/nav/footer/header/aside
    - prefer <main> or <article>, otherwise body
    """
    soup = BeautifulSoup(html, "lxml")

    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    for selector in ["nav", "footer", "header", "aside"]:
        for tag in soup.select(selector):
            tag.decompose()

    container = soup.find("main") or soup.find("article") or soup.body or soup
    text = container.get_text(" ", strip=True)
    return _clean_text(text)


def _extract_title(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    if soup.title and soup.title.string:
        return _clean_text(soup.title.string)
    h1 = soup.find("h1")
    return _clean_text(h1.get_text(strip=True)) if h1 else ""


def _extract_links(base_url: str, html: str, limit: int = 50) -> list[dict]:
    soup = BeautifulSoup(html, "lxml")
    links = []
    seen = set()

    for a in soup.find_all("a", href=True):
        href = a.get("href", "").strip()
        if not href:
            continue
        abs_url = urljoin(base_url, href)
        abs_url = abs_url.split("#", 1)[0]

        if not _is_valid_url(abs_url):
            continue
        if abs_url in seen:
            continue

        seen.add(abs_url)
        links.append(
            {
                "url": abs_url,
                "text": _clean_text(a.get_text(" ", strip=True))[:200],
            }
        )
        if len(links) >= limit:
            break

    return links


def _safe_truncate(s: str, max_chars: int) -> str:
    if len(s) <= max_chars:
        return s
    return s[: max_chars - 3] + "..."


# --- MCP-exposed tool functions (type hints + docstrings help MCP clients) ---

def scrape_url(
    url: str,
    *,
    mode: str = "text",
    timeout_s: int = 20,
    max_chars: int = 12000,
    follow_redirects: bool = True,
    user_agent: str = "Mozilla/5.0 (compatible; GradioMCPUrlScraper/1.0)",
) -> dict:
    """
    Fetch and scrape a URL.

    Parameters:
        url: The http(s) URL to fetch.
        mode: One of:
            - "text": returns title + extracted main text
            - "html": returns raw HTML (truncated)
            - "links": returns list of outgoing links (url + anchor text)
            - "all": returns title + text + links + html (truncated)
        timeout_s: Request timeout in seconds.
        max_chars: Maximum characters returned for large fields.
        follow_redirects: Whether to follow redirects.
        user_agent: Custom User-Agent header.

    Returns:
        A JSON-serializable dict with fields depending on mode.
    """
    url = (url or "").strip()
    if not _is_valid_url(url):
        return {"ok": False, "error": "Invalid URL. Must start with http:// or https://", "url": url}

    headers = {"User-Agent": user_agent}
    try:
        with httpx.Client(headers=headers, timeout=timeout_s, follow_redirects=follow_redirects) as client:
            r = client.get(url)
            content_type = (r.headers.get("content-type") or "").lower()
            html = r.text if "text" in content_type or "html" in content_type or not content_type else r.text

        out: dict = {
            "ok": True,
            "url": str(r.url),
            "status_code": r.status_code,
            "content_type": content_type,
        }

        # Always compute title if HTML-ish
        title = _extract_title(html)
        if title:
            out["title"] = title

        mode = (mode or "text").strip().lower()
        if mode not in {"text", "html", "links", "all"}:
            return {"ok": False, "error": f"Invalid mode '{mode}'. Use text|html|links|all.", "url": url}

        if mode in {"text", "all"}:
            text = _extract_main_text(html)
            out["text"] = _safe_truncate(text, max_chars)

        if mode in {"links", "all"}:
            out["links"] = _extract_links(str(r.url), html, limit=50)

        if mode in {"html", "all"}:
            out["html"] = _safe_truncate(html, max_chars)

        return out

    except httpx.HTTPError as e:
        return {"ok": False, "error": f"HTTP error: {type(e).__name__}: {str(e)}", "url": url}
    except Exception as e:
        return {"ok": False, "error": f"Unexpected error: {type(e).__name__}: {str(e)}", "url": url}


def scrape_many(urls_json: str, mode: str = "text") -> list[dict]:
    """
    Scrape multiple URLs in one call.

    Parameters:
        urls_json: JSON array of URLs, e.g. ["https://example.com", "https://example.org"]
        mode: text|html|links|all

    Returns:
        List of scrape_url() results.
    """
    try:
        urls = json.loads(urls_json)
        if not isinstance(urls, list):
            raise ValueError("urls_json must be a JSON array")
    except Exception as e:
        return [{"ok": False, "error": f"Invalid JSON array: {str(e)}", "url": ""}]

    results = []
    for u in urls[:25]:  # prevent abuse
        results.append(scrape_url(str(u), mode=mode))
    return results


# --- Gradio UI ---

with gr.Blocks(title="MCP URL Scraper") as demo:
    gr.Markdown(
        """
# MCP URL Scraper (Gradio + Hugging Face Spaces)
- Use the UI to scrape a single URL
- Or connect as an MCP server (tools: `scrape_url`, `scrape_many`)
"""
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com", scale=3)
        mode_in = gr.Dropdown(["text", "links", "html", "all"], value="text", label="Mode", scale=1)

    with gr.Row():
        timeout_in = gr.Slider(5, 60, value=20, step=1, label="Timeout (s)")
        maxchars_in = gr.Slider(1000, 50000, value=12000, step=1000, label="Max chars returned")

    run_btn = gr.Button("Scrape")
    out_json = gr.JSON(label="Result")

    run_btn.click(
        fn=scrape_url,
        inputs=[url_in, mode_in, timeout_in, maxchars_in],
        outputs=[out_json],
    )

if __name__ == "__main__":
    # Helps avoid some SSR-related weirdness on Spaces; users have reported ssr_mode=False as a workaround. :contentReference[oaicite:2]{index=2}
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        ssr_mode=False,
        mcp_server=True,  # this enables the MCP endpoints :contentReference[oaicite:3]{index=3}
    )