Naveen-2007 committed on
Commit
05a9f57
·
1 Parent(s): 3021956

Fix: Lightweight requirements for Azure - CPU-only PyTorch

Browse files
Files changed (3) hide show
  1. requirements.txt +53 -41
  2. startup.sh +7 -0
  3. tools/browse_tool.py +30 -6
requirements.txt CHANGED
@@ -1,41 +1,53 @@
1
- # Core LLM + tools
2
- langchain
3
- langchain-core
4
- langchain-community
5
- langgraph
6
-
7
- # LLM providers (Groq via LangChain)
8
- langchain-groq
9
-
10
- # Web API backend
11
- fastapi
12
- uvicorn[standard]
13
- pydantic
14
- python-dotenv
15
- python-multipart
16
- aiofiles
17
-
18
- # Streamlit Frontend
19
- streamlit
20
-
21
- # HTTP & Networking
22
- requests
23
- httpx
24
-
25
- # Embeddings + vector search
26
- sentence-transformers
27
- faiss-cpu
28
-
29
- # Web search + HTTP
30
- tavily-python
31
-
32
- # Scraping
33
- trafilatura
34
- beautifulsoup4
35
-
36
- # Wikipedia tool dependency
37
- wikipedia
38
-
39
- # PDF/text load support
40
- pypdf
41
- python-pptx
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================
2
+ # LIGHTWEIGHT REQUIREMENTS FOR AZURE APP SERVICE
3
+ # =============================================
4
+
5
+ # Core LangChain (minimal)
6
+ langchain==0.1.20
7
+ langchain-core==0.1.52
8
+ langchain-community==0.0.38
9
+ langgraph==0.0.55
10
+
11
+ # Groq LLM
12
+ langchain-groq==0.1.3
13
+
14
+ # Web API
15
+ fastapi==0.110.0
16
+ uvicorn==0.27.1
17
+ pydantic==2.6.1
18
+ python-dotenv==1.0.1
19
+ python-multipart==0.0.9
20
+ aiofiles==23.2.1
21
+
22
+ # Streamlit
23
+ streamlit==1.31.1
24
+
25
+ # HTTP
26
+ requests==2.31.0
27
+ httpx==0.26.0
28
+
29
+ # Embeddings - USE CPU-ONLY TORCH (smaller)
30
+ --extra-index-url https://download.pytorch.org/whl/cpu
31
+ torch==2.2.0+cpu
32
+ sentence-transformers==2.3.1
33
+
34
+ # Vector search
35
+ faiss-cpu==1.7.4
36
+
37
+ # Web search
38
+ tavily-python==0.3.3
39
+
40
+ # Scraping (lightweight)
41
+ beautifulsoup4==4.12.3
42
+ lxml==5.1.0
43
+
44
+ # Wikipedia
45
+ wikipedia==1.4.0
46
+
47
+ # Document processing
48
+ pypdf==4.0.1
49
+ python-pptx==0.6.23
50
+
51
+ # Required by langchain
52
+ tiktoken==0.6.0
53
+ numpy==1.26.4
startup.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Start FastAPI backend on port 8000 in background
4
+ uvicorn app.api:app --host 0.0.0.0 --port 8000 &
5
+
6
+ # Start Streamlit frontend on port 8501 (Azure will use this)
7
+ streamlit run streamlit_app.py --server.port 8501 --server.address 0.0.0.0 --server.headless true
tools/browse_tool.py CHANGED
@@ -1,5 +1,12 @@
1
  import requests
2
- import trafilatura
 
 
 
 
 
 
 
3
 
4
 
5
  class BrowseTool:
@@ -7,12 +14,29 @@ class BrowseTool:
7
 
8
  def fetch_clean(self, url: str) -> str:
9
  try:
10
- resp = requests.get(url, timeout=20)
 
 
11
  resp.raise_for_status()
12
  html = resp.text
13
- text = trafilatura.extract(
14
- html, include_comments=False, include_tables=False
15
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return text or ""
17
- except Exception:
 
18
  return ""
 
1
  import requests
2
+ from bs4 import BeautifulSoup
3
+
4
+ # Try to import trafilatura, fallback to BeautifulSoup if not available
5
+ try:
6
+ import trafilatura
7
+ HAS_TRAFILATURA = True
8
+ except ImportError:
9
+ HAS_TRAFILATURA = False
10
 
11
 
12
  class BrowseTool:
 
14
 
15
  def fetch_clean(self, url: str) -> str:
16
  try:
17
+ resp = requests.get(url, timeout=20, headers={
18
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
19
+ })
20
  resp.raise_for_status()
21
  html = resp.text
22
+
23
+ # Use trafilatura if available, otherwise fallback to BeautifulSoup
24
+ if HAS_TRAFILATURA:
25
+ text = trafilatura.extract(
26
+ html, include_comments=False, include_tables=False
27
+ )
28
+ else:
29
+ # Fallback: use BeautifulSoup
30
+ soup = BeautifulSoup(html, 'lxml')
31
+ # Remove script and style elements
32
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
33
+ element.decompose()
34
+ text = soup.get_text(separator='\n', strip=True)
35
+ # Clean up extra whitespace
36
+ lines = [line.strip() for line in text.splitlines() if line.strip()]
37
+ text = '\n'.join(lines[:100]) # Limit to first 100 lines
38
+
39
  return text or ""
40
+ except Exception as e:
41
+ print(f"Browse error: {e}")
42
  return ""