"""일반 웹페이지 fetch + 마크다운 변환 툴.

Content-Type 분기:
- text/html → BeautifulSoup → markdownify
- application/pdf → pypdf로 페이지별 텍스트 추출 (arxiv·NASA TR 등 외부 PDF용)
"""
import io
import re
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from smolagents import tool


def _handle_pdf_url(content: bytes) -> str:
    """외부 PDF URL 본문을 페이지별 텍스트로 변환. attachments._handle_pdf와 동일 패턴."""
    try:
        from pypdf import PdfReader
        reader = PdfReader(io.BytesIO(content))
        parts = []
        for i, page in enumerate(reader.pages):
            try:
                txt = page.extract_text() or ""
            except Exception as pe:
                txt = f"(extraction failed: {pe})"
            parts.append(f"--- Page {i+1} ---\n{txt}")
        combined = "\n\n".join(parts)
        if len(combined) > 12000:
            combined = combined[:12000] + "\n...[truncated]"
        return f"[PDF, {len(reader.pages)} pages]\n{combined}"
    except Exception as e:
        return f"PDF parse error: {e}"


# NOTE(review): the docstring below is kept verbatim because @tool presumably
# uses it as the agent-facing tool description — confirm before rewording it.
@tool
def visit_webpage(url: str) -> str:
    """Fetch a web page (HTML or PDF) and return its readable text (truncated to ~12k chars).

    HTML pages are converted to markdown. PDF URLs are parsed page-by-page via pypdf —
    useful for arxiv papers, NASA technical reports, and other linked PDF documents.

    Args:
        url: The full URL of the webpage or PDF to fetch.
    """
    from urllib.parse import urlsplit

    max_chars = 12000
    try:
        # Some sites reject requests that carry no User-Agent, so send one explicitly.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"}
        resp = requests.get(url, headers=headers, timeout=20)
        resp.raise_for_status()
        content_type = resp.headers.get("Content-Type", "").lower()
        body = resp.content

        # PDF detection, strongest signal first:
        #   1. the server labels the response as a PDF;
        #   2. the payload starts with the "%PDF-" signature (covers PDFs served
        #      as application/octet-stream or with a missing Content-Type);
        #   3. the URL *path* ends in .pdf — query string and fragment are
        #      ignored so "paper.pdf?download=1" matches — unless the server
        #      explicitly says the response is HTML (e.g. a landing or error
        #      page living at a .pdf URL).
        url_path = urlsplit(url).path.lower()
        is_pdf = (
            "application/pdf" in content_type
            or body[:1024].lstrip().startswith(b"%PDF-")
            or (url_path.endswith(".pdf") and "html" not in content_type)
        )
        if is_pdf:
            return _handle_pdf_url(body)

        # HTML path. Parse the raw bytes instead of resp.text: without a charset
        # in the Content-Type header, requests decodes text/* as ISO-8859-1,
        # which garbles UTF-8 pages that only declare their encoding in <meta>.
        # BeautifulSoup detects the encoding from the document itself; the HTTP
        # charset is passed along only when the server actually sent one.
        http_charset = resp.encoding if "charset" in content_type else None
        soup = BeautifulSoup(body, "html.parser", from_encoding=http_charset)
        # Drop blocks that carry no readable content: script/style/noscript.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        markdown = md(str(soup))
        # markdownify tends to emit long runs of blank lines; collapse them to save tokens.
        markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()
        # Protect the LLM context window: cut overly long pages.
        if len(markdown) > max_chars:
            markdown = markdown[:max_chars] + "\n...[truncated]"
        return markdown
    except Exception as e:
        # Tool contract: report failures as text instead of raising.
        return f"visit_webpage error: {e}"