Naveen-2007 committed on
Commit
05a9f57
·
1 Parent(s): 3021956

Fix: Lightweight requirements for Azure - CPU-only PyTorch

Browse files
Files changed (3) hide show
  1. requirements.txt +53 -41
  2. startup.sh +7 -0
  3. tools/browse_tool.py +30 -6
requirements.txt CHANGED
@@ -1,41 +1,53 @@
1
- # Core LLM + tools
2
- langchain
3
- langchain-core
4
- langchain-community
5
- langgraph
6
-
7
- # LLM providers (Groq via LangChain)
8
- langchain-groq
9
-
10
- # Web API backend
11
- fastapi
12
- uvicorn[standard]
13
- pydantic
14
- python-dotenv
15
- python-multipart
16
- aiofiles
17
-
18
- # Streamlit Frontend
19
- streamlit
20
-
21
- # HTTP & Networking
22
- requests
23
- httpx
24
-
25
- # Embeddings + vector search
26
- sentence-transformers
27
- faiss-cpu
28
-
29
- # Web search + HTTP
30
- tavily-python
31
-
32
- # Scraping
33
- trafilatura
34
- beautifulsoup4
35
-
36
- # Wikipedia tool dependency
37
- wikipedia
38
-
39
- # PDF/text load support
40
- pypdf
41
- python-pptx
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================
2
+ # LIGHTWEIGHT REQUIREMENTS FOR AZURE APP SERVICE
3
+ # =============================================
4
+
5
+ # Core LangChain (minimal)
6
+ langchain==0.1.20
7
+ langchain-core==0.1.52
8
+ langchain-community==0.0.38
9
+ langgraph==0.0.55
10
+
11
+ # Groq LLM
12
+ langchain-groq==0.1.3
13
+
14
+ # Web API
15
+ fastapi==0.110.0
16
+ uvicorn==0.27.1
17
+ pydantic==2.6.1
18
+ python-dotenv==1.0.1
19
+ python-multipart==0.0.9
20
+ aiofiles==23.2.1
21
+
22
+ # Streamlit
23
+ streamlit==1.31.1
24
+
25
+ # HTTP
26
+ requests==2.31.0
27
+ httpx==0.26.0
28
+
29
+ # Embeddings - USE CPU-ONLY TORCH (smaller)
30
+ --extra-index-url https://download.pytorch.org/whl/cpu
31
+ torch==2.2.0+cpu
32
+ sentence-transformers==2.3.1
33
+
34
+ # Vector search
35
+ faiss-cpu==1.7.4
36
+
37
+ # Web search
38
+ tavily-python==0.3.3
39
+
40
+ # Scraping (lightweight)
41
+ beautifulsoup4==4.12.3
42
+ lxml==5.1.0
43
+
44
+ # Wikipedia
45
+ wikipedia==1.4.0
46
+
47
+ # Document processing
48
+ pypdf==4.0.1
49
+ python-pptx==0.6.23
50
+
51
+ # Required by langchain
52
+ tiktoken==0.6.0
53
+ numpy==1.26.4
startup.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Start FastAPI backend on port 8000 in background
4
+ uvicorn app.api:app --host 0.0.0.0 --port 8000 &
5
+
6
+ # Start Streamlit frontend on port 8501 (Azure will use this)
7
+ streamlit run streamlit_app.py --server.port 8501 --server.address 0.0.0.0 --server.headless true
tools/browse_tool.py CHANGED
@@ -1,5 +1,12 @@
1
  import requests
2
- import trafilatura
 
 
 
 
 
 
 
3
 
4
 
5
  class BrowseTool:
@@ -7,12 +14,29 @@ class BrowseTool:
7
 
8
  def fetch_clean(self, url: str) -> str:
9
  try:
10
- resp = requests.get(url, timeout=20)
 
 
11
  resp.raise_for_status()
12
  html = resp.text
13
- text = trafilatura.extract(
14
- html, include_comments=False, include_tables=False
15
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return text or ""
17
- except Exception:
 
18
  return ""
 
1
  import requests
2
+ from bs4 import BeautifulSoup
3
+
4
+ # Try to import trafilatura, fallback to BeautifulSoup if not available
5
+ try:
6
+ import trafilatura
7
+ HAS_TRAFILATURA = True
8
+ except ImportError:
9
+ HAS_TRAFILATURA = False
10
 
11
 
12
  class BrowseTool:
 
14
 
15
  def fetch_clean(self, url: str) -> str:
16
  try:
17
+ resp = requests.get(url, timeout=20, headers={
18
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
19
+ })
20
  resp.raise_for_status()
21
  html = resp.text
22
+
23
+ # Use trafilatura if available, otherwise fallback to BeautifulSoup
24
+ if HAS_TRAFILATURA:
25
+ text = trafilatura.extract(
26
+ html, include_comments=False, include_tables=False
27
+ )
28
+ else:
29
+ # Fallback: use BeautifulSoup
30
+ soup = BeautifulSoup(html, 'lxml')
31
+ # Remove script and style elements
32
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
33
+ element.decompose()
34
+ text = soup.get_text(separator='\n', strip=True)
35
+ # Clean up extra whitespace
36
+ lines = [line.strip() for line in text.splitlines() if line.strip()]
37
+ text = '\n'.join(lines[:100]) # Limit to first 100 lines
38
+
39
  return text or ""
40
+ except Exception as e:
41
+ print(f"Browse error: {e}")
42
  return ""