# ── FILE: requirements.txt ────────────────────────────────────────────────
# Added: openai (for HF router OpenAI-compatible client)

flask==3.1.0
python-dotenv==1.0.1
langgraph==0.2.55
langchain==0.3.7
langchain-huggingface==0.1.2
langchain-core==0.3.21
langchain-community==0.3.7
huggingface-hub==0.26.2
sentence-transformers==3.3.1
faiss-cpu==1.9.0
rank-bm25==0.2.2
pypdf==5.1.0
duckduckgo-search==6.3.7
numpy==1.26.4
gunicorn==23.0.0
werkzeug==3.1.3
beautifulsoup4==4.12.3
lxml==5.3.0
openai==1.59.0


# ── FILE: agents/llm_factory.py ───────────────────────────────────────────
# Uses OpenAI-compatible client pointed at router.huggingface.co/v1
# This is the officially documented method in HF docs as of 2026.

import os
from openai import OpenAI

# HF router OpenAI-compatible endpoint — officially documented.
# Passed as ``base_url`` to the OpenAI client built in call_llm().
_HF_BASE_URL = "https://router.huggingface.co/v1"

# Registry of selectable chat models, keyed by a short internal model key.
# Each entry carries:
#   "id"          — model identifier; get_current_model_id() returns it and
#                   call_llm() sends it as the ``model`` argument.
#   "label", "description", "speed", "params"
#                 — descriptive metadata; not read anywhere in this module
#                   (presumably shown in a model picker UI — confirm against callers).
AVAILABLE_MODELS = {
    "llama3-8b": {
        "id":          "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "label":       "Llama 3.1 8B (Meta)",
        "description": "Best balance of quality and speed. Most widely available on free-tier providers.",
        "speed":       "fast",
        "params":      "8B",
    },
    "qwen25-7b": {
        "id":          "Qwen/Qwen2.5-7B-Instruct",
        "label":       "Qwen 2.5 7B (Alibaba)",
        "description": "Strong multilingual reasoning. Excellent for structured output and document analysis.",
        "speed":       "fast",
        "params":      "7B",
    },
    "phi35-mini": {
        "id":          "microsoft/Phi-3.5-mini-instruct",
        "label":       "Phi-3.5 Mini (Microsoft)",
        "description": "3.8B params — fastest option. Good for simple Q&A and quick demos.",
        "speed":       "fast",
        "params":      "3.8B",
    },
    "mistral-7b": {
        "id":          "mistralai/Mistral-7B-Instruct-v0.3",
        "label":       "Mistral 7B v0.3",
        "description": "Strong instruction following. Available via Sambanova on free credits.",
        "speed":       "medium",
        "params":      "7B",
    },
    "gemma2-9b": {
        "id":          "google/gemma-2-9b-it",
        "label":       "Gemma 2 9B (Google)",
        "description": "Google's Gemma 2 instruction-tuned — strong factual grounding and reasoning.",
        "speed":       "medium",
        "params":      "9B",
    },
}

# Key into AVAILABLE_MODELS of the model used for LLM calls.
# Module-level state: rebound by set_current_model().
_current_model_key = "llama3-8b"


def get_current_model_key() -> str:
    """Return the registry key of the model currently selected for LLM calls."""
    selected_key = _current_model_key
    return selected_key


def set_current_model(key: str):
    """Make ``key`` the active model for subsequent LLM calls.

    Args:
        key: A key of ``AVAILABLE_MODELS``.

    Raises:
        ValueError: If ``key`` is not in ``AVAILABLE_MODELS``. The current
            selection is left unchanged in that case.
    """
    global _current_model_key
    if key in AVAILABLE_MODELS:
        _current_model_key = key
        return
    raise ValueError(f"Unknown model key '{key}'. Valid: {list(AVAILABLE_MODELS)}")


def get_current_model_id() -> str:
    """Return the ``"id"`` field of the currently selected model's registry entry."""
    active_entry = AVAILABLE_MODELS[_current_model_key]
    return active_entry["id"]


def call_llm(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7) -> str:
    """Send a single-turn prompt to the HF router and return the reply text.

    Uses the OpenAI-compatible chat completions API of the HF router with the
    currently selected model (see ``get_current_model_id``).

    Args:
        prompt: Text sent as the only ``user`` message.
        max_new_tokens: Forwarded as ``max_tokens`` to the completion call.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``""`` when the model returned no text content.

    Raises:
        EnvironmentError: If ``HF_TOKEN`` is unset or blank.
        RuntimeError: If the response contains no completion choices.
    """
    # Strip stray whitespace/newlines: a blank value is as good as unset, and a
    # token pasted with a trailing newline would produce an invalid auth header.
    token = os.getenv("HF_TOKEN", "").strip()
    if not token:
        raise EnvironmentError("HF_TOKEN is not set. Add your HuggingFace Read token in Space secrets or .env.")

    client   = OpenAI(base_url=_HF_BASE_URL, api_key=token)
    model_id = get_current_model_id()

    response = client.chat.completions.create(
        model=model_id,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        # Floor at 0.01 (presumably because some providers reject temperature=0 — confirm).
        temperature=max(temperature, 0.01),
    )
    if not response.choices:
        raise RuntimeError(f"Model '{model_id}' returned no completion choices.")

    # ``message.content`` is optional in the chat completions schema (e.g. on
    # refusals or filtered output); treat a missing body as an empty reply
    # instead of crashing on ``None.strip()``.
    content = response.choices[0].message.content
    return (content or "").strip()


# ── FILE: rag/ingestor.py ─────────────────────────────────────────────────
# Changes:
#   1. Better browser-like headers to reduce 403s on public sites
#   2. Retry with header rotation on 403
#   3. Clear error message listing which sites block bots
#   4. Longer timeout

import os, re, time, requests
from pypdf import PdfReader
from bs4 import BeautifulSoup
from duckduckgo_search.exceptions import RatelimitException

# Upper bound on PDF file size (10 * 1024 * 1024 bytes); PDFIngestor.ingest
# raises ValueError for files larger than this.
MAX_PDF_BYTES = 10 * 1024 * 1024

# Browser-like header sets tried in order by URLIngestor._fetch_text; the
# second set is used when the first attempt fails.
# NOTE: "br" (Brotli) is deliberately NOT advertised in Accept-Encoding.
# requests/urllib3 only decode Brotli when an optional brotli package is
# installed; without it a Brotli-compressed body would be handed to the HTML
# parser still compressed and yield garbage text.
_HEADERS_LIST = [
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    },
]

# Sites known to block all bot traffic regardless of headers.
# URLIngestor._check_blocked refuses URLs whose host is in this set before any
# request is made. Membership is an exact string match, so other subdomains or
# country domains must be listed explicitly.
_BLOCKED_DOMAINS = {"amazon.com", "www.amazon.com", "amazon.ca", "amazon.co.uk"}


class PDFIngestor:
    """Extract text from a PDF file and split it into overlapping word chunks.

    Every chunk is a dict with the keys ``page_content`` (space-joined words),
    ``page`` (1-based page number the words came from) and ``source`` (the
    PDF's file name).
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Configure the chunking window.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            chunk_overlap: Number of words shared by consecutive chunks; must
                be non-negative and smaller than ``chunk_size``.

        Raises:
            ValueError: If the window is invalid. An overlap >= chunk size
                would give the sliding window a non-positive step, so chunking
                would never terminate.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}.")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}."
            )
        self.chunk_size    = chunk_size
        self.chunk_overlap = chunk_overlap

    def _extract_text(self, path: str) -> list:
        """Return ``[{"text": ..., "page": n}, ...]`` for every page with text.

        Pages whose extracted text is empty after stripping are skipped; page
        numbers are 1-based.
        """
        reader = PdfReader(path)
        pages  = []
        for i, page in enumerate(reader.pages):
            text = (page.extract_text() or "").strip()
            if text:
                pages.append({"text": text, "page": i + 1})
        return pages

    def _chunk(self, page_data: list, source: str) -> list:
        """Split each page's text into overlapping word windows.

        Chunks never span pages. Within a page the window advances by
        ``chunk_size - chunk_overlap`` words and stops as soon as a window
        reaches the end of the page, so no chunk is merely a suffix of the
        previous one.
        """
        step   = self.chunk_size - self.chunk_overlap
        chunks = []
        for pd in page_data:
            text  = re.sub(r"\s+", " ", pd["text"])
            words = text.split()
            start = 0
            while start < len(words):
                end   = min(start + self.chunk_size, len(words))
                chunk = " ".join(words[start:end])
                chunks.append({"page_content": chunk, "page": pd["page"], "source": source})
                if end == len(words):
                    # Final window: advancing further would only repeat the
                    # overlapping tail already contained in this chunk.
                    break
                start += step
        return chunks

    def ingest(self, path: str) -> list:
        """Read the PDF at ``path`` and return its chunks.

        Raises:
            ValueError: If the file is larger than ``MAX_PDF_BYTES``.
        """
        size = os.path.getsize(path)
        if size > MAX_PDF_BYTES:
            raise ValueError(f"File exceeds 10 MB limit ({size/1024/1024:.1f} MB).")
        filename = os.path.basename(path)
        pages    = self._extract_text(path)
        return self._chunk(pages, filename)


class URLIngestor:
    """Download a web page, reduce it to visible text and cut it into chunks.

    Every chunk is a dict with the keys ``page_content`` (space-joined words),
    ``page`` (1-based running chunk number — web pages have no real pages) and
    ``source`` (the URL's network location).
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Configure the chunking window.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            chunk_overlap: Number of words shared by consecutive chunks; must
                be non-negative and smaller than ``chunk_size``.

        Raises:
            ValueError: If the window is invalid. An overlap >= chunk size
                would give the sliding window a non-positive step, so chunking
                would never terminate.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}.")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}."
            )
        self.chunk_size    = chunk_size
        self.chunk_overlap = chunk_overlap

    def _check_blocked(self, url: str):
        """Raise ``ValueError`` if ``url`` points at a host in ``_BLOCKED_DOMAINS``."""
        from urllib.parse import urlparse
        # ``hostname`` (unlike ``netloc``) excludes any "user@" prefix and
        # ":port" suffix, so "https://amazon.com:443/..." is still recognised.
        domain = (urlparse(url).hostname or "").lower()
        if domain in _BLOCKED_DOMAINS:
            raise ValueError(
                f"⛔ {domain} actively blocks all automated access (HTTP 403). "
                f"This is Amazon's anti-bot policy — no tool can bypass it. "
                f"Use their public help page via Google cache, or paste the text content manually."
            )

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip non-content tags from ``html`` and return whitespace-normalised text."""
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script","style","nav","footer","header","aside","form","noscript","iframe"]):
            tag.decompose()
        # Prefer the most specific content container that exists.
        main = soup.find("main") or soup.find("article") or soup.find("body") or soup
        text = main.get_text(separator=" ", strip=True)
        return re.sub(r"\s+", " ", text).strip()

    def _fetch_text(self, url: str) -> str:
        """Fetch ``url`` and return its visible text (more than 200 characters).

        Each header set in ``_HEADERS_LIST`` is tried in order. A 403 response,
        a network error or a page with too little text moves on to the next
        header set; any other HTTP error status is raised immediately.

        Raises:
            requests.HTTPError: On a non-403 HTTP error, or when every header
                set was answered with 403.
            ValueError: When all attempts failed for another reason.
        """
        last_error = None
        last_index = len(_HEADERS_LIST) - 1
        for i, headers in enumerate(_HEADERS_LIST):
            try:
                resp = requests.get(url, headers=headers, timeout=25, allow_redirects=True)
                if resp.status_code == 403:
                    forbidden = requests.HTTPError(
                        f"403 Forbidden — this website blocks automated access. "
                        f"Try a different URL (Wikipedia, WHO, government sites, and news sites work well).",
                        response=resp
                    )
                    if i < last_index:
                        # Some sites reject one browser fingerprint but accept
                        # another: retry with the next header set first.
                        last_error = forbidden
                        time.sleep(2)
                        continue
                    raise forbidden
                resp.raise_for_status()
                text = self._html_to_text(resp.text)
                if len(text) > 200:
                    return text
                # Record why this attempt was rejected so the final error is
                # informative instead of "Last error: None".
                last_error = ValueError(
                    f"page yielded only {len(text)} characters of text "
                    f"(it may require JavaScript or block bots)"
                )
            except requests.HTTPError:
                raise
            except Exception as e:
                last_error = e
                if i < last_index:
                    time.sleep(2)
        raise ValueError(f"Could not fetch URL after {len(_HEADERS_LIST)} attempts. Last error: {last_error}")

    def _chunk(self, text: str, source: str) -> list:
        """Split ``text`` into overlapping word windows numbered from 1.

        The window advances by ``chunk_size - chunk_overlap`` words and stops
        as soon as a window reaches the end of the text, so no chunk is merely
        a suffix of the previous one.
        """
        words  = text.split()
        step   = self.chunk_size - self.chunk_overlap
        chunks = []
        start  = 0
        page   = 1
        while start < len(words):
            end   = min(start + self.chunk_size, len(words))
            chunk = " ".join(words[start:end])
            chunks.append({"page_content": chunk, "page": page, "source": source})
            if end == len(words):
                # Final window: advancing further would only repeat the
                # overlapping tail already contained in this chunk.
                break
            start += step
            page  += 1
        return chunks

    def ingest(self, url: str) -> list:
        """Fetch ``url`` and return its text as chunks (capped at 15000 words).

        Raises:
            ValueError: If the host is blocked, the page could not be fetched,
                or too little text could be extracted.
            requests.HTTPError: If the server answered with an HTTP error.
        """
        self._check_blocked(url)
        text = self._fetch_text(url)
        # Safety net; _fetch_text already rejects pages with very little text.
        if len(text) < 100:
            raise ValueError("Could not extract meaningful content. The page may require JavaScript or block bots.")
        words = text.split()
        if len(words) > 15000:
            text = " ".join(words[:15000])
        from urllib.parse import urlparse
        source = urlparse(url).netloc or url
        return self._chunk(text, source)


class SearchIngestor:
    """Run a DuckDuckGo text search and ingest the first result that can be fetched."""

    def __init__(self):
        self._url_ingestor = URLIngestor()

    @staticmethod
    def _build_query(query: str, site: str = "") -> str:
        """Return ``query``, prefixed with a ``site:`` filter when ``site`` is non-blank.

        ``site`` is stripped first: ``"site: example.org"`` (with a space after
        the colon) would not be a valid site filter.
        """
        site = site.strip()
        return f"site:{site} {query}" if site else query

    def _ddg_search(self, query: str, max_results: int = 5) -> list:
        """Return up to ``max_results`` DuckDuckGo text hits for ``query``.

        Rate-limit errors are retried (3 attempts in total) with a growing
        pause of 5 s, then 10 s between attempts.

        Raises:
            ValueError: If the search fails or stays rate limited; the
                original exception is attached as ``__cause__``.
        """
        from duckduckgo_search import DDGS
        max_attempts = 3
        last_error   = None
        for attempt in range(max_attempts):
            try:
                with DDGS() as ddgs:
                    return list(ddgs.text(query, max_results=max_results))
            except RatelimitException as e:
                last_error = e
                # Only wait when another attempt follows; sleeping before
                # giving up would just delay the error.
                if attempt < max_attempts - 1:
                    time.sleep((attempt + 1) * 5)
            except Exception as e:
                raise ValueError(f"Search failed: {e}") from e
        raise ValueError(
            f"DuckDuckGo rate limited. Wait a few seconds and try again. ({last_error})"
        ) from last_error

    def search_and_ingest(self, query: str, site: str = "") -> dict:
        """Search for ``query`` and ingest the first hit that can be fetched.

        Args:
            query: Search terms.
            site: Optional domain to restrict the search to.

        Returns:
            ``{"url": ..., "title": ..., "chunks": [...]}`` for the first hit
            whose page could be ingested; ``title`` falls back to the URL when
            the hit has no title.

        Raises:
            ValueError: If the search returned nothing or no hit could be
                ingested.
        """
        full_query = self._build_query(query, site)
        hits       = self._ddg_search(full_query)
        if not hits:
            raise ValueError("No search results found for this query.")
        last_error = None
        for hit in hits:
            url = hit.get("href", "")
            if not url:
                continue
            try:
                chunks = self._url_ingestor.ingest(url)
            except Exception as e:
                # Best effort: remember the failure and try the next hit.
                last_error = e
                continue
            return {"url": url, "title": hit.get("title") or url, "chunks": chunks}
        raise ValueError(f"Could not fetch any search result. Last error: {last_error}") from last_error


# ── PATCH: templates/index.html — replace Amazon demo card only ───────────
# Find this block in the demo-cards-grid div and replace it:
#
# OLD (Amazon card — 403 always):
#   <div class="demo-card" onclick="loadDemo(this)"
#        data-url="https://www.amazon.com/gp/help/customer/..."
#        data-q="What is the return window for electronics...">
#     ...🛒 Retail / Amazon Return Policy...
#   </div>
#
# NEW (FTC consumer rights — public government site, no bot blocking):

/*
      <div class="demo-card" onclick="loadDemo(this)"
           data-url="https://consumer.ftc.gov/articles/understanding-your-credit-billing-rights"
           data-q="What are the key consumer rights when disputing a charge on a credit card statement?">
        <div class="demo-card-icon">🛒</div>
        <div class="demo-card-industry" style="color:var(--gold)">Consumer Rights</div>
        <div class="demo-card-title">FTC — Credit Billing Rights</div>
        <div class="demo-card-q">"What are the key consumer rights when disputing a charge on a credit card statement?"</div>
        <div class="demo-card-meta">
          <span class="demo-card-tag" style="background:rgba(245,158,11,.12);color:var(--gold)">URL</span>
          <span class="demo-card-tag" style="background:rgba(79,142,247,.1);color:var(--accent)">consumer.ftc.gov</span>
        </div>
      </div>
*/