# ── FILE: requirements.txt ────────────────────────────────────────────────
# Added: openai (for HF router OpenAI-compatible client)
# NOTE: the pins below are kept as comment lines so this multi-file listing
# stays valid Python; drop the leading "# " when copying them into
# requirements.txt.
# flask==3.1.0
# python-dotenv==1.0.1
# langgraph==0.2.55
# langchain==0.3.7
# langchain-huggingface==0.1.2
# langchain-core==0.3.21
# langchain-community==0.3.7
# huggingface-hub==0.26.2
# sentence-transformers==3.3.1
# faiss-cpu==1.9.0
# rank-bm25==0.2.2
# pypdf==5.1.0
# duckduckgo-search==6.3.7
# numpy==1.26.4
# gunicorn==23.0.0
# werkzeug==3.1.3
# beautifulsoup4==4.12.3
# lxml==5.3.0
# openai==1.59.0

# ── FILE: agents/llm_factory.py ───────────────────────────────────────────
# Uses OpenAI-compatible client pointed at router.huggingface.co/v1
# This is the officially documented method in HF docs as of 2026.
import os

from openai import OpenAI

# HF router OpenAI-compatible endpoint — officially documented
_HF_BASE_URL = "https://router.huggingface.co/v1"

# Registry of selectable models, keyed by the short name used by callers.
# "id" is the model identifier sent to the router; the remaining fields are
# descriptive metadata.
AVAILABLE_MODELS = {
    "llama3-8b": {
        "id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "label": "Llama 3.1 8B (Meta)",
        "description": "Best balance of quality and speed. Most widely available on free-tier providers.",
        "speed": "fast",
        "params": "8B",
    },
    "qwen25-7b": {
        "id": "Qwen/Qwen2.5-7B-Instruct",
        "label": "Qwen 2.5 7B (Alibaba)",
        "description": "Strong multilingual reasoning. Excellent for structured output and document analysis.",
        "speed": "fast",
        "params": "7B",
    },
    "phi35-mini": {
        "id": "microsoft/Phi-3.5-mini-instruct",
        "label": "Phi-3.5 Mini (Microsoft)",
        "description": "3.8B params — fastest option. Good for simple Q&A and quick demos.",
        "speed": "fast",
        "params": "3.8B",
    },
    "mistral-7b": {
        "id": "mistralai/Mistral-7B-Instruct-v0.3",
        "label": "Mistral 7B v0.3",
        "description": "Strong instruction following. Available via Sambanova on free credits.",
        "speed": "medium",
        "params": "7B",
    },
    "gemma2-9b": {
        "id": "google/gemma-2-9b-it",
        "label": "Gemma 2 9B (Google)",
        "description": "Google's Gemma 2 instruction-tuned — strong factual grounding and reasoning.",
        "speed": "medium",
        "params": "9B",
    },
}

# Key into AVAILABLE_MODELS of the model used by call_llm(); module-global.
_current_model_key = "llama3-8b"


def get_current_model_key() -> str:
    """Return the AVAILABLE_MODELS key of the currently selected model."""
    return _current_model_key


def set_current_model(key: str) -> None:
    """Select the model used by subsequent call_llm() calls.

    Raises:
        ValueError: if *key* is not a key of AVAILABLE_MODELS.
    """
    global _current_model_key
    if key not in AVAILABLE_MODELS:
        raise ValueError(f"Unknown model key '{key}'. Valid: {list(AVAILABLE_MODELS)}")
    _current_model_key = key


def get_current_model_id() -> str:
    """Return the router model id of the currently selected model."""
    return AVAILABLE_MODELS[_current_model_key]["id"]


def call_llm(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7) -> str:
    """Call the HF router using OpenAI-compatible API — the official 2026 method.

    Sends *prompt* as a single user message to the currently selected model.

    Args:
        prompt: text of the user message.
        max_new_tokens: forwarded as ``max_tokens``.
        temperature: sampling temperature; values below 0.01 are raised to 0.01.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or ``""`` when the reply carries no text content.

    Raises:
        EnvironmentError: if the HF_TOKEN environment variable is unset or empty.
    """
    token = os.getenv("HF_TOKEN", "")
    if not token:
        raise EnvironmentError("HF_TOKEN is not set. Add your HuggingFace Read token in Space secrets or .env.")

    client = OpenAI(base_url=_HF_BASE_URL, api_key=token)
    response = client.chat.completions.create(
        model=get_current_model_id(),
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        temperature=max(temperature, 0.01),
    )
    # message.content is optional in the chat-completions schema; calling
    # .strip() on None would raise AttributeError, so fall back to "".
    content = response.choices[0].message.content
    return (content or "").strip()


# ── FILE: rag/ingestor.py ─────────────────────────────────────────────────
# Changes:
# 1. Better browser-like headers to reduce 403s on public sites
# 2. Retry with header rotation on 403
# 3. Clear error message listing which sites block bots
# 4.
#    Longer timeout
import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from duckduckgo_search.exceptions import RatelimitException
from pypdf import PdfReader

MAX_PDF_BYTES = 10 * 1024 * 1024

# Upper bound on the number of words kept from a fetched web page.
_MAX_URL_WORDS = 15000

# Rotate between two user-agent strings on retry
_HEADERS_LIST = [
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        # "br" is deliberately not advertised: requests/urllib3 can only
        # decode brotli when an optional brotli package is installed, and
        # none is listed in the requirements, so a brotli-compressed body
        # would come back as undecodable bytes.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    },
]

# Sites known to block all bot traffic regardless of headers
_BLOCKED_DOMAINS = {"amazon.com", "www.amazon.com", "amazon.ca", "amazon.co.uk"}


def _validate_chunking(chunk_size: int, chunk_overlap: int) -> None:
    """Reject chunk settings that would make the sliding window stall.

    Raises:
        ValueError: if chunk_size is not positive, or chunk_overlap is
            negative or not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}.")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be >= 0 and smaller than chunk_size "
            f"(got chunk_overlap={chunk_overlap}, chunk_size={chunk_size})."
        )


def _window_words(words: list, chunk_size: int, chunk_overlap: int) -> list:
    """Join *words* into overlapping windows of at most *chunk_size* words.

    Consecutive windows share *chunk_overlap* words. Returns a list of
    space-joined strings; an empty *words* list yields an empty list.
    """
    _validate_chunking(chunk_size, chunk_overlap)
    step = chunk_size - chunk_overlap
    windows = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        windows.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; another one would
            # contain only overlap words that were just emitted.
            break
        start += step
    return windows


class PDFIngestor:
    """Extract the text of a PDF file and split it into word-window chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Store chunking parameters (both measured in words).

        Raises:
            ValueError: if the parameters would produce a non-advancing window.
        """
        _validate_chunking(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _extract_text(self, path: str) -> list:
        """Return ``[{"text", "page"}]`` for every page with non-empty text.

        Page numbers are 1-based; pages whose extracted text is empty are skipped.
        """
        reader = PdfReader(path)
        pages = []
        for number, page in enumerate(reader.pages, start=1):
            text = (page.extract_text() or "").strip()
            if text:
                pages.append({"text": text, "page": number})
        return pages

    def _chunk(self, page_data: list, source: str) -> list:
        """Chunk each page separately so every chunk maps to a single page.

        Returns a list of ``{"page_content", "page", "source"}`` dicts.
        """
        chunks = []
        for page in page_data:
            # str.split() already collapses any run of whitespace.
            words = page["text"].split()
            for piece in _window_words(words, self.chunk_size, self.chunk_overlap):
                chunks.append({"page_content": piece, "page": page["page"], "source": source})
        return chunks

    def ingest(self, path: str) -> list:
        """Read the PDF at *path* and return its chunks.

        The chunk ``source`` is the file's base name.

        Raises:
            ValueError: if the file is larger than MAX_PDF_BYTES.
        """
        size = os.path.getsize(path)
        if size > MAX_PDF_BYTES:
            limit_mb = MAX_PDF_BYTES / 1024 / 1024
            raise ValueError(f"File exceeds {limit_mb:.0f} MB limit ({size/1024/1024:.1f} MB).")
        filename = os.path.basename(path)
        pages = self._extract_text(path)
        return self._chunk(pages, filename)


class URLIngestor:
    """Fetch a web page, reduce it to readable text and split it into chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Store chunking parameters (both measured in words).

        Raises:
            ValueError: if the parameters would produce a non-advancing window.
        """
        _validate_chunking(chunk_size, chunk_overlap)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _check_blocked(self, url: str):
        """Raise ValueError when *url* points at a domain in _BLOCKED_DOMAINS.

        Matching uses the URL's hostname (so a port or userinfo in the URL
        does not hide the domain) and also covers subdomains of a listed
        domain.
        """
        host = (urlparse(url).hostname or "").lower()
        is_blocked = any(
            host == blocked or host.endswith("." + blocked)
            for blocked in _BLOCKED_DOMAINS
        )
        if is_blocked:
            raise ValueError(
                f"⛔ {host} actively blocks all automated access (HTTP 403). "
                f"This is Amazon's anti-bot policy — no tool can bypass it. "
                f"Use their public help page via Google cache, or paste the text content manually."
            )

    @staticmethod
    def _html_to_text(html: str) -> str:
        """Strip boilerplate tags from *html* and return whitespace-normalised text."""
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "form", "noscript", "iframe"]):
            tag.decompose()
        # Prefer the most specific content container that exists.
        main = soup.find("main") or soup.find("article") or soup.find("body") or soup
        text = main.get_text(separator=" ", strip=True)
        return re.sub(r"\s+", " ", text).strip()

    def _fetch_text(self, url: str) -> str:
        """Download *url* and return its readable text (more than 200 chars).

        Each entry of _HEADERS_LIST is tried in turn, pausing 2 seconds
        between attempts. A 403 response, a network error or a page with too
        little text moves on to the next header set; any other HTTP error
        status is raised immediately.

        Raises:
            requests.HTTPError: on a non-403 HTTP error status, or when the
                last attempt is still answered with 403.
            ValueError: when every attempt failed for another reason.
        """
        last_error = None
        total = len(_HEADERS_LIST)
        for attempt, headers in enumerate(_HEADERS_LIST, start=1):
            is_last = attempt == total
            try:
                resp = requests.get(url, headers=headers, timeout=25, allow_redirects=True)
                if resp.status_code == 403:
                    raise requests.HTTPError(
                        f"403 Forbidden — this website blocks automated access. "
                        f"Try a different URL (Wikipedia, WHO, government sites, and news sites work well).",
                        response=resp,
                    )
                resp.raise_for_status()
                text = self._html_to_text(resp.text)
                if len(text) > 200:
                    return text
                # Record why this attempt was rejected so the final error
                # message does not end in "Last error: None".
                last_error = ValueError(
                    f"only {len(text)} characters of readable text were extracted"
                )
            except requests.HTTPError as exc:
                response = exc.response
                forbidden = response is not None and response.status_code == 403
                # A 403 can depend on the request headers, so it is worth one
                # more try with the next header set; other statuses are final.
                if not forbidden or is_last:
                    raise
                last_error = exc
            except Exception as exc:
                last_error = exc
            if not is_last:
                time.sleep(2)
        raise ValueError(f"Could not fetch URL after {total} attempts. Last error: {last_error}")

    def _chunk(self, text: str, source: str) -> list:
        """Split *text* into overlapping word windows.

        Web pages have no real pages, so ``page`` is the 1-based chunk index.
        """
        pieces = _window_words(text.split(), self.chunk_size, self.chunk_overlap)
        return [
            {"page_content": piece, "page": index, "source": source}
            for index, piece in enumerate(pieces, start=1)
        ]

    def ingest(self, url: str) -> list:
        """Fetch *url* and return its chunks; ``source`` is the URL's netloc.

        Only the first 15000 words of the page are kept.

        Raises:
            ValueError: if the domain is block-listed or no usable text
                could be extracted.
            requests.HTTPError: if the server answers with an error status.
        """
        self._check_blocked(url)
        text = self._fetch_text(url)
        # Defensive: _fetch_text only returns texts longer than 200 characters.
        if len(text) < 100:
            raise ValueError("Could not extract meaningful content. The page may require JavaScript or block bots.")
        words = text.split()
        if len(words) > _MAX_URL_WORDS:
            text = " ".join(words[:_MAX_URL_WORDS])
        source = urlparse(url).netloc or url
        return self._chunk(text, source)


class SearchIngestor:
    """Run a DuckDuckGo search and ingest the first result that can be fetched."""

    def __init__(self):
        self._url_ingestor = URLIngestor()

    def _ddg_search(self, query: str, max_results: int = 5) -> list:
        """Return DuckDuckGo text results for *query*.

        Rate-limit errors are retried up to 3 times with a growing pause
        (5 s, then 10 s) between attempts.

        Raises:
            ValueError: if the search fails or stays rate limited.
        """
        attempts = 3
        last_error = None
        for attempt in range(1, attempts + 1):
            try:
                with DDGS() as ddgs:
                    return list(ddgs.text(query, max_results=max_results))
            except RatelimitException as exc:
                last_error = exc
                # Only wait when another attempt follows; sleeping after the
                # final failure would just delay the error.
                if attempt < attempts:
                    time.sleep(attempt * 5)
            except Exception as exc:
                raise ValueError(f"Search failed: {exc}") from exc
        raise ValueError(
            f"DuckDuckGo rate limited. Wait a few seconds and try again. ({last_error})"
        ) from last_error

    def search_and_ingest(self, query: str, site: str = "") -> dict:
        """Search for *query* (optionally restricted to *site*) and ingest a hit.

        Results are tried in order; the first one whose URL can be ingested
        is returned as ``{"url", "title", "chunks"}``.

        Raises:
            ValueError: if there are no results or none could be fetched.
        """
        site = site.strip()
        full_query = f"site:{site} {query}" if site else query
        hits = self._ddg_search(full_query)
        if not hits:
            raise ValueError("No search results found for this query.")

        last_error = None
        for hit in hits:
            url = hit.get("href", "")
            if not url:
                continue
            try:
                chunks = self._url_ingestor.ingest(url)
            except Exception as exc:
                # Best effort: remember the failure and try the next result.
                last_error = exc
                continue
            return {"url": url, "title": hit.get("title", url), "chunks": chunks}
        raise ValueError(f"Could not fetch any search result. Last error: {last_error}") from last_error


# ── PATCH: templates/index.html — replace Amazon demo card only ───────────
# Find this block in the demo-cards-grid div and replace it:
#
# OLD (Amazon card — 403 always):
#
# ...🛒 Retail / Amazon Return Policy... #
#
# NEW (FTC consumer rights — public government site, no bot blocking):
# /*
#   🛒
#   Consumer Rights
#   FTC — Credit Billing Rights
#   "What are the key consumer rights when disputing a charge on a credit card statement?"
#   URL consumer.ftc.gov
# */