Spaces:
Sleeping
Sleeping
| # ── FILE: requirements.txt ──────────────────────────────────────────────── | |
| # Added: openai (for HF router OpenAI-compatible client) | |
| flask==3.1.0 | |
| python-dotenv==1.0.1 | |
| langgraph==0.2.55 | |
| langchain==0.3.7 | |
| langchain-huggingface==0.1.2 | |
| langchain-core==0.3.21 | |
| langchain-community==0.3.7 | |
| huggingface-hub==0.26.2 | |
| sentence-transformers==3.3.1 | |
| faiss-cpu==1.9.0 | |
| rank-bm25==0.2.2 | |
| pypdf==5.1.0 | |
| duckduckgo-search==6.3.7 | |
| numpy==1.26.4 | |
| gunicorn==23.0.0 | |
| werkzeug==3.1.3 | |
| beautifulsoup4==4.12.3 | |
| lxml==5.3.0 | |
| openai==1.59.0 | |
| # ── FILE: agents/llm_factory.py ─────────────────────────────────────────── | |
| # Uses OpenAI-compatible client pointed at router.huggingface.co/v1 | |
| # This is the officially documented method in HF docs as of 2026. | |
| import os | |
| from openai import OpenAI | |
# Hugging Face router endpoint that speaks the OpenAI-compatible API.
_HF_BASE_URL = "https://router.huggingface.co/v1"

# Catalogue of selectable chat models, keyed by a short internal key.
# Every entry holds the Hub repository id plus display metadata
# (label, description, speed class, parameter count).
AVAILABLE_MODELS = {
    "llama3-8b": dict(
        id="meta-llama/Meta-Llama-3.1-8B-Instruct",
        label="Llama 3.1 8B (Meta)",
        description="Best balance of quality and speed. Most widely available on free-tier providers.",
        speed="fast",
        params="8B",
    ),
    "qwen25-7b": dict(
        id="Qwen/Qwen2.5-7B-Instruct",
        label="Qwen 2.5 7B (Alibaba)",
        description="Strong multilingual reasoning. Excellent for structured output and document analysis.",
        speed="fast",
        params="7B",
    ),
    "phi35-mini": dict(
        id="microsoft/Phi-3.5-mini-instruct",
        label="Phi-3.5 Mini (Microsoft)",
        description="3.8B params β fastest option. Good for simple Q&A and quick demos.",
        speed="fast",
        params="3.8B",
    ),
    "mistral-7b": dict(
        id="mistralai/Mistral-7B-Instruct-v0.3",
        label="Mistral 7B v0.3",
        description="Strong instruction following. Available via Sambanova on free credits.",
        speed="medium",
        params="7B",
    ),
    "gemma2-9b": dict(
        id="google/gemma-2-9b-it",
        label="Gemma 2 9B (Google)",
        description="Google's Gemma 2 instruction-tuned β strong factual grounding and reasoning.",
        speed="medium",
        params="9B",
    ),
}

# Key of the model that is active until set_current_model() selects another.
_current_model_key = "llama3-8b"
def get_current_model_key() -> str:
    """Return the catalogue key of the model that is currently selected."""
    selected = _current_model_key
    return selected
def set_current_model(key: str):
    """Make the model registered under ``key`` the active one.

    Args:
        key: A key of ``AVAILABLE_MODELS``.

    Raises:
        ValueError: If ``key`` is not present in ``AVAILABLE_MODELS``; the
            message lists the valid keys.
    """
    global _current_model_key
    if key in AVAILABLE_MODELS:
        _current_model_key = key
        return
    raise ValueError(f"Unknown model key '{key}'. Valid: {list(AVAILABLE_MODELS)}")
def get_current_model_id() -> str:
    """Return the ``"id"`` field of the currently selected catalogue entry."""
    entry = AVAILABLE_MODELS[_current_model_key]
    return entry["id"]
def call_llm(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7) -> str:
    """Send ``prompt`` to the current model through the HF router and return its reply.

    The request goes through the OpenAI-compatible chat-completions API as a
    single user message, authenticated with the ``HF_TOKEN`` environment
    variable.

    Args:
        prompt: Text sent as the only (user) message.
        max_new_tokens: Forwarded as ``max_tokens``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.

    Returns:
        The reply text with surrounding whitespace stripped, or an empty
        string when the reply message carries no text content.

    Raises:
        EnvironmentError: If ``HF_TOKEN`` is unset or empty.
    """
    token = os.getenv("HF_TOKEN", "")
    if not token:
        raise EnvironmentError("HF_TOKEN is not set. Add your HuggingFace Read token in Space secrets or .env.")

    client = OpenAI(base_url=_HF_BASE_URL, api_key=token)
    response = client.chat.completions.create(
        model=get_current_model_id(),
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        # NOTE(review): the 0.01 floor presumably avoids providers that reject
        # a temperature of exactly 0 — confirm.
        temperature=max(temperature, 0.01),
    )

    # `message.content` is Optional in the OpenAI SDK (e.g. refusals or
    # tool-call-only replies); calling .strip() on None would raise.
    content = response.choices[0].message.content
    return (content or "").strip()
| # ── FILE: rag/ingestor.py ───────────────────────────────────────────────── | |
| # Changes: | |
| # 1. Better browser-like headers to reduce 403s on public sites | |
| # 2. Retry with header rotation on 403 | |
| # 3. Clear error message listing which sites block bots | |
| # 4. Longer timeout | |
| import os, re, time, requests | |
| from pypdf import PdfReader | |
| from bs4 import BeautifulSoup | |
| from duckduckgo_search.exceptions import RatelimitException | |
# Largest PDF accepted by PDFIngestor.ingest, in bytes (10 MiB).
MAX_PDF_BYTES = 10 * 1024 * 1024

# Browser-like header sets; the URL fetcher walks through them in order,
# one per attempt.
_HEADERS_LIST = [
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        # "br" is deliberately not advertised: requests only decodes Brotli
        # when an optional brotli package is installed, and none is listed in
        # requirements.txt, so a Brotli-compressed body would reach the HTML
        # parser as undecoded bytes.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    },
]

# Hosts rejected up front because they block bot traffic regardless of headers.
_BLOCKED_DOMAINS = {"amazon.com", "www.amazon.com", "amazon.ca", "amazon.co.uk"}
class PDFIngestor:
    """Turn a PDF file into overlapping word-window chunks, page by page."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Configure the chunk window.

        Args:
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (the window would never
                advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words.")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size.")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _extract_text(self, path: str) -> list:
        """Return ``{"text", "page"}`` dicts for every page that yields text.

        Page numbers are 1-based; pages whose extracted text is empty are
        skipped.
        """
        reader = PdfReader(path)
        pages = []
        for number, page in enumerate(reader.pages, start=1):
            text = (page.extract_text() or "").strip()
            if text:
                pages.append({"text": text, "page": number})
        return pages

    def _chunk(self, page_data: list, source: str) -> list:
        """Split each page's text into overlapping word windows.

        Args:
            page_data: Dicts with ``"text"`` and ``"page"`` keys, as produced
                by ``_extract_text``.
            source: Label stored in every chunk's ``"source"`` field.

        Returns:
            A list of ``{"page_content", "page", "source"}`` dicts. Chunks
            never span pages.
        """
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        for page in page_data:
            # str.split() already collapses every run of whitespace.
            words = page["text"].split()
            for start in range(0, len(words), step):
                end = start + self.chunk_size
                chunks.append({
                    "page_content": " ".join(words[start:end]),
                    "page": page["page"],
                    "source": source,
                })
                # Stop once the window reaches the last word; another step
                # would only emit a tail already contained in this chunk.
                if end >= len(words):
                    break
        return chunks

    def ingest(self, path: str) -> list:
        """Read the PDF at ``path`` and return its chunks.

        Raises:
            ValueError: If the file is larger than ``MAX_PDF_BYTES``.
        """
        size = os.path.getsize(path)
        if size > MAX_PDF_BYTES:
            raise ValueError(f"File exceeds 10 MB limit ({size/1024/1024:.1f} MB).")
        filename = os.path.basename(path)
        pages = self._extract_text(path)
        return self._chunk(pages, filename)
class URLIngestor:
    """Fetch a web page and split its readable text into overlapping word chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 80):
        """Configure the chunk window.

        Args:
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (the window would never
                advance).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words.")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size.")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _check_blocked(self, url: str):
        """Raise ``ValueError`` when the URL's host is listed in ``_BLOCKED_DOMAINS``."""
        from urllib.parse import urlparse

        # `hostname` is lower-cased and excludes any port or userinfo, so
        # "https://www.amazon.com:443/..." is matched as well.
        domain = urlparse(url).hostname or ""
        if domain in _BLOCKED_DOMAINS:
            raise ValueError(
                f"β {domain} actively blocks all automated access (HTTP 403). "
                f"This is Amazon's anti-bot policy β no tool can bypass it. "
                f"Use their public help page via Google cache, or paste the text content manually."
            )

    def _html_to_text(self, html: str) -> str:
        """Strip non-content tags from ``html`` and return whitespace-normalised text."""
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "nav", "footer", "header", "aside", "form", "noscript", "iframe"]):
            tag.decompose()
        # Prefer the most specific content container that is present.
        main = soup.find("main") or soup.find("article") or soup.find("body") or soup
        text = main.get_text(separator=" ", strip=True)
        return re.sub(r"\s+", " ", text).strip()

    def _fetch_text(self, url: str) -> str:
        """Download ``url`` and return its readable text.

        One attempt is made per entry of ``_HEADERS_LIST``, with a 2-second
        pause between attempts. A 403 response, a network error, or a page
        with 200 characters of text or fewer moves on to the next header set.
        Any other HTTP error status is raised immediately.

        Raises:
            requests.HTTPError: On a non-403 error status, or when the final
                attempt was still answered with 403.
            ValueError: When every attempt failed for another reason.
        """
        last_error = None
        final_attempt = len(_HEADERS_LIST) - 1
        for i, headers in enumerate(_HEADERS_LIST):
            try:
                resp = requests.get(url, headers=headers, timeout=25, allow_redirects=True)
                if resp.status_code == 403:
                    # Remember the 403 and retry with the next header set
                    # instead of giving up on the first attempt.
                    last_error = requests.HTTPError(
                        "403 Forbidden β this website blocks automated access. "
                        "Try a different URL (Wikipedia, WHO, government sites, and news sites work well).",
                        response=resp,
                    )
                else:
                    resp.raise_for_status()
                    text = self._html_to_text(resp.text)
                    if len(text) > 200:
                        return text
                    # Record why this attempt was rejected so the final
                    # message does not read "Last error: None".
                    last_error = ValueError(
                        f"page returned too little readable text ({len(text)} characters)"
                    )
            except requests.HTTPError:
                raise
            except Exception as e:
                last_error = e
            if i < final_attempt:
                time.sleep(2)
        if isinstance(last_error, requests.HTTPError):
            # Only the 403 case is stored as an HTTPError; keep its type.
            raise last_error
        raise ValueError(f"Could not fetch URL after {len(_HEADERS_LIST)} attempts. Last error: {last_error}")

    def _chunk(self, text: str, source: str) -> list:
        """Split ``text`` into overlapping word windows.

        Returns:
            A list of ``{"page_content", "page", "source"}`` dicts where
            ``"page"`` is the 1-based ordinal of the chunk.
        """
        words = text.split()
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        for page, start in enumerate(range(0, len(words), step), start=1):
            end = start + self.chunk_size
            chunks.append({
                "page_content": " ".join(words[start:end]),
                "page": page,
                "source": source,
            })
            # Stop once the window reaches the last word; another step would
            # only emit a tail already contained in this chunk.
            if end >= len(words):
                break
        return chunks

    def ingest(self, url: str) -> list:
        """Fetch ``url`` and return its chunks, labelled with the URL's host.

        Only the first 15000 words of the page are kept.

        Raises:
            ValueError: If the host is blocked, the page could not be
                fetched, or too little text was extracted.
            requests.HTTPError: If the server answered with an error status.
        """
        from urllib.parse import urlparse

        self._check_blocked(url)
        text = self._fetch_text(url)
        # Defensive: _fetch_text only returns text longer than 200 characters.
        if len(text) < 100:
            raise ValueError("Could not extract meaningful content. The page may require JavaScript or block bots.")
        words = text.split()
        if len(words) > 15000:
            text = " ".join(words[:15000])
        source = urlparse(url).netloc or url
        return self._chunk(text, source)
class SearchIngestor:
    """Run a DuckDuckGo search and ingest the first result that can be fetched."""

    def __init__(self):
        self._url_ingestor = URLIngestor()

    def _ddg_search(self, query: str, max_results: int = 5) -> list:
        """Return DuckDuckGo text results for ``query``.

        Rate-limit errors are retried up to three times with a growing pause
        (5 s, then 10 s) between attempts.

        Raises:
            ValueError: If the search fails for any other reason, or is still
                rate limited after the last attempt. The original exception
                is chained as the cause.
        """
        from duckduckgo_search import DDGS

        attempts = 3
        last_error = None
        for attempt in range(attempts):
            try:
                with DDGS() as ddgs:
                    return list(ddgs.text(query, max_results=max_results))
            except RatelimitException as e:
                last_error = e
                # No point in waiting after the final attempt; fail right away.
                if attempt < attempts - 1:
                    time.sleep((attempt + 1) * 5)
            except Exception as e:
                raise ValueError(f"Search failed: {e}") from e
        raise ValueError(
            f"DuckDuckGo rate limited. Wait a few seconds and try again. ({last_error})"
        ) from last_error

    def search_and_ingest(self, query: str, site: str = "") -> dict:
        """Search for ``query`` and ingest the first hit that can be fetched.

        Args:
            query: Search terms.
            site: Optional domain; when non-blank the search is restricted
                with a ``site:`` operator.

        Returns:
            ``{"url", "title", "chunks"}`` for the first hit whose URL was
            ingested successfully.

        Raises:
            ValueError: If the search returns nothing or no hit could be
                ingested.
        """
        # Normalise once so stray whitespace cannot break the `site:` operator
        # and a None value is treated like "no restriction".
        site = (site or "").strip()
        full_query = f"site:{site} {query}" if site else query
        hits = self._ddg_search(full_query)
        if not hits:
            raise ValueError("No search results found for this query.")

        last_error = None
        for hit in hits:
            url = hit.get("href", "")
            if not url:
                continue
            try:
                chunks = self._url_ingestor.ingest(url)
            except Exception as e:
                # Best effort: remember the failure and try the next hit.
                last_error = e
                continue
            return {"url": url, "title": hit.get("title", url), "chunks": chunks}
        raise ValueError(f"Could not fetch any search result. Last error: {last_error}")
| # ── PATCH: templates/index.html — replace Amazon demo card only ─────────── | |
| # Find this block in the demo-cards-grid div and replace it: | |
| # | |
| # OLD (Amazon card — 403 always): | |
| # <div class="demo-card" onclick="loadDemo(this)" | |
| # data-url="https://www.amazon.com/gp/help/customer/..." | |
| # data-q="What is the return window for electronics..."> | |
| # ...π Retail / Amazon Return Policy... | |
| # </div> | |
| # | |
| # NEW (FTC consumer rights — public government site, no bot blocking): | |
| /* | |
| <div class="demo-card" onclick="loadDemo(this)" | |
| data-url="https://consumer.ftc.gov/articles/understanding-your-credit-billing-rights" | |
| data-q="What are the key consumer rights when disputing a charge on a credit card statement?"> | |
| <div class="demo-card-icon">π</div> | |
| <div class="demo-card-industry" style="color:var(--gold)">Consumer Rights</div> | |
| <div class="demo-card-title">FTC — Credit Billing Rights</div> | |
| <div class="demo-card-q">"What are the key consumer rights when disputing a charge on a credit card statement?"</div> | |
| <div class="demo-card-meta"> | |
| <span class="demo-card-tag" style="background:rgba(245,158,11,.12);color:var(--gold)">URL</span> | |
| <span class="demo-card-tag" style="background:rgba(79,142,247,.1);color:var(--accent)">consumer.ftc.gov</span> | |
| </div> | |
| </div> | |
| */ |