"""Firecrawl API client for scraping and crawling official web pages."""

from __future__ import annotations

import os
from functools import lru_cache
from typing import Any

from firecrawl import Firecrawl
from firecrawl.v2.types import ScrapeOptions

# Character cap that scrape_page applies to a single scraped page's markdown.
SCRAPE_MARKDOWN_LIMIT = 8_000
# Character cap applied to each page's markdown when a document is normalized
# by _page_from_document (used for every crawled page).
CRAWL_PAGE_MARKDOWN_LIMIT = 4_000


def _truncate(text: str | None, limit: int) -> str:
    """Return *text* cut to at most *limit* characters, marking any cut.

    Falsy input (``None`` or an empty string) yields ``""``. Text that already
    fits is returned unchanged; longer text is sliced to *limit* characters and
    a truncation marker line is appended, so the result may exceed *limit* by
    the length of that marker.
    """
    if not text:
        return ""
    fits = len(text) <= limit
    return text if fits else f"{text[:limit]}\n… (truncated)"


@lru_cache(maxsize=1)
def _client() -> Firecrawl:
    """Build the Firecrawl client from ``FIRECRAWL_API_KEY`` and memoize it.

    The ``lru_cache`` wrapper keeps the constructed client, so later calls
    reuse the same instance. A call that raises is not cached, so the
    environment variable is read again on the next attempt.

    Raises:
        RuntimeError: If ``FIRECRAWL_API_KEY`` is unset or empty.
    """
    key = os.getenv("FIRECRAWL_API_KEY")
    if key:
        return Firecrawl(api_key=key)
    raise RuntimeError("FIRECRAWL_API_KEY is not set")


def _location(country: str | None) -> dict[str, Any] | None:
    """Build a location dict for a country code, or ``None`` if none is given.

    The code is stripped of surrounding whitespace and upper-cased. A missing,
    empty, or whitespace-only value yields ``None``. The returned dict always
    starts with ``["en"]`` as its language list.
    """
    code = (country or "").strip().upper()
    return {"country": code, "languages": ["en"]} if code else None


def _metadata_dict(metadata: Any) -> dict[str, Any]:
    """Coerce document metadata into a plain dict.

    Objects exposing ``model_dump`` are converted by calling it; a dict is
    returned as-is (same object, not a copy); ``None`` and any other value
    produce an empty dict.
    """
    if metadata is not None and hasattr(metadata, "model_dump"):
        return metadata.model_dump()
    return metadata if isinstance(metadata, dict) else {}


def _page_from_document(doc: Any) -> dict[str, Any]:
    """Flatten a scraped document into the page dict returned by this module.

    ``doc`` is read through ``getattr`` for optional ``metadata`` and
    ``markdown`` attributes. The page URL is the first non-empty value among
    the metadata keys ``source_url``, ``url`` and ``og_url`` (empty string if
    none), and is reported under both ``"url"`` and ``"source_url"``. Markdown
    is truncated to ``CRAWL_PAGE_MARKDOWN_LIMIT`` characters.
    """
    meta = _metadata_dict(getattr(doc, "metadata", None))
    # Lazy lookup: stops at the first key holding a truthy value.
    origin = next(filter(None, map(meta.get, ("source_url", "url", "og_url"))), "")
    body = _truncate(getattr(doc, "markdown", None), CRAWL_PAGE_MARKDOWN_LIMIT)
    return {
        "url": origin,
        "title": meta.get("title") or "",
        "markdown": body,
        "source_url": origin,
        "status_code": meta.get("status_code"),
        "description": meta.get("description") or "",
    }


def scrape_page(
    url: str,
    *,
    only_main_content: bool = True,
    country: str | None = None,
    languages: list[str] | None = None,
    max_age: int | None = None,
    timeout: int = 60,
) -> dict[str, Any]:
    """
    Scrape a single URL and return normalized markdown content.

    Args:
        url: Full HTTPS URL to scrape; surrounding whitespace is stripped.
        only_main_content: Drop navigation, footer, and ads when possible.
        country: Optional ISO-2 country code for geo-targeted content.
        languages: Optional preferred languages. They replace the default
            language list of the location built from ``country`` and are
            ignored when no ``country`` is given.
        max_age: Cache freshness window in milliseconds; ``None`` uses SDK default.
        timeout: Request timeout in seconds (sent to the SDK in milliseconds).

    Returns:
        On success, a page dict with ``url`` (the stripped input URL),
        ``title``, ``markdown`` (truncated to ``SCRAPE_MARKDOWN_LIMIT``
        characters), ``source_url``, ``status_code`` and ``description``.
        On failure, ``{"error": <message>, "url": <url>}``; exceptions from
        the scrape are reported this way instead of being raised.
    """
    normalized_url = (url or "").strip()
    if not normalized_url:
        return {"error": "url is required", "url": url}

    try:
        location = _location(country)
        if location and languages:
            location["languages"] = languages

        kwargs: dict[str, Any] = {
            "formats": ["markdown"],
            "only_main_content": only_main_content,
            "timeout": timeout * 1000,
        }
        if location:
            kwargs["location"] = location
        if max_age is not None:
            kwargs["max_age"] = max_age

        doc = _client().scrape(normalized_url, **kwargs)
        page = _page_from_document(doc)
        page["url"] = normalized_url
        # _page_from_document has already cut the markdown at the smaller
        # per-crawl-page limit, so re-truncating its output could never return
        # more than that. Truncate the raw document markdown at the scrape
        # limit instead so single-page scrapes get their full allowance.
        page["markdown"] = _truncate(getattr(doc, "markdown", None), SCRAPE_MARKDOWN_LIMIT)
        return page
    except Exception as exc:
        # Deliberate best-effort boundary: callers receive an error dict.
        return {"error": str(exc), "url": normalized_url}


def crawl_site(
    url: str,
    *,
    limit: int = 10,
    max_depth: int = 2,
    include_paths: list[str] | None = None,
    exclude_paths: list[str] | None = None,
    country: str | None = None,
    poll_interval: int = 2,
    timeout: int = 120,
) -> dict[str, Any]:
    """
    Crawl a site from a start URL and summarize every page that was fetched.

    Args:
        url: Starting URL for the crawl; surrounding whitespace is stripped.
        limit: Maximum number of pages to crawl.
        max_depth: Maximum discovery depth from the start URL.
        include_paths: Optional path filters forwarded as ``include_paths``;
            omitted from the request when empty.
        exclude_paths: Optional path filters forwarded as ``exclude_paths``;
            omitted from the request when empty.
        country: Optional ISO-2 country code for geo-targeted content.
        poll_interval: Seconds between crawl status polls.
        timeout: Maximum seconds to wait for crawl completion.

    Returns:
        On success, a dict with ``start_url``, ``pages_found``, ``status`` and
        ``pages`` (one normalized page dict per crawled document). On failure,
        ``{"error": <message>, "start_url": <url>}``; a blank ``url`` yields
        ``{"error": "url is required", "url": <original value>}``.
    """
    start_url = (url or "").strip()
    if not start_url:
        return {"error": "url is required", "url": url}

    try:
        geo = _location(country)
        crawl_kwargs: dict[str, Any] = {
            "limit": limit,
            "max_discovery_depth": max_depth,
            "scrape_options": ScrapeOptions(
                formats=["markdown"],
                only_main_content=True,
                location=geo,
            ),
            "poll_interval": poll_interval,
            "timeout": timeout,
        }
        # Path filters are only sent when the caller supplied a non-empty list.
        optional_filters = (
            ("include_paths", include_paths),
            ("exclude_paths", exclude_paths),
        )
        for key, paths in optional_filters:
            if paths:
                crawl_kwargs[key] = paths

        job = _client().crawl(start_url, **crawl_kwargs)
        pages = list(map(_page_from_document, job.data or []))
        status = getattr(job, "status", None)
    except Exception as exc:
        return {"error": str(exc), "start_url": start_url}

    return {
        "start_url": start_url,
        "pages_found": len(pages),
        "status": status,
        "pages": pages,
    }