PraneshJs committed
Commit fe7e039 · verified · 1 parent: be25aad

Added files

Files changed (4)
  1. Dockerfile +51 -0
  2. app.py +207 -0
  3. requirements.txt +13 -0
  4. start.sh +3 -0
Dockerfile ADDED
@@ -0,0 +1,51 @@
+ # Use Playwright python image (Chromium + deps preinstalled)
+ FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
+
+ ENV PYTHONUNBUFFERED=1
+ WORKDIR /app
+
+ # Disable SSL/TLS verification to work around certificate issues in the build environment
+ ENV PYTHONHTTPSVERIFY=0
+ ENV SSL_VERIFY=false
+ ENV REQUESTS_CA_BUNDLE=""
+ ENV CURL_CA_BUNDLE=""
+
+ # Update certificates and install network tools
+ RUN apt-get update && apt-get install -y \
+     ca-certificates \
+     curl \
+     wget \
+     openssl \
+     && update-ca-certificates \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first to leverage Docker layer caching
+ COPY requirements.txt /app/requirements.txt
+
+ # Install Python dependencies, trusting the PyPI hosts to avoid SSL errors
+ RUN pip install --no-cache-dir --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org -r requirements.txt
+
+ # Install Playwright browser with system dependencies
+ RUN python -m playwright install --with-deps chromium
+
+ # Run Crawl4AI setup
+ RUN crawl4ai-setup
+
+ # Run Crawl4AI doctor (diagnostics; non-fatal if it fails)
+ RUN crawl4ai-doctor || true
+
+ # Copy the rest of the application
+ COPY . /app
+
+ # Make start script executable
+ RUN chmod +x /app/start.sh
+
+ # Use the non-root user that the Playwright image provides
+ USER pwuser
+
+ # Expose port for Render
+ EXPOSE 5000
+
+ # Start backend via start.sh (uvicorn)
+ CMD ["/app/start.sh"]
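
For reference, building and running this image locally might look like the sketch below; the search-assistant tag and the .env file are illustrative assumptions, not part of this commit:

    docker build -t search-assistant .
    docker run --rm -p 5000:5000 --env-file .env search-assistant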
app.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ import asyncio
+ import requests
+ import urllib3
+ from openai import AzureOpenAI
+ from dotenv import load_dotenv
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+ from crawl4ai.content_filter_strategy import PruningContentFilter
+ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+ from fastapi import FastAPI, HTTPException
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ import uvicorn
+ import json
+
+ # Disable SSL warnings
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+ load_dotenv()
+
+ # Initialize FastAPI app
+ app = FastAPI(title="Search Assistant API")
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # In production, replace with specific origins
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Mount static files
+ # app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ # Initialize Azure OpenAI client
+ client = AzureOpenAI(
+     api_key=os.getenv("AZURE_OPENAI_KEY"),
+     api_version="2025-01-01-preview",
+     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
+ )
+
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
+ DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT")
+
+ class SearchRequest(BaseModel):
+     question: str
+     mode: str = "quick"  # "quick" or "deep"
+
+ class SearchResponse(BaseModel):
+     answer: str
+     sources: list
+     mode: str
+     status: str = "success"
+
+ def search_serper(query):
+     headers = {
+         "X-API-KEY": SERPER_API_KEY,
+         "Content-Type": "application/json"
+     }
+     payload = {"q": query}
+     response = requests.post("https://google.serper.dev/search", headers=headers, json=payload, verify=False)
+     results = response.json()
+
+     # Return both snippets and URLs for crawling
+     search_results = []
+     for result in results.get("organic", [])[:3]:  # Limit to top 3 for crawling
+         title = result.get("title", "")
+         snippet = result.get("snippet", "")
+         url = result.get("link", "")
+         search_results.append({
+             "title": title,
+             "snippet": snippet,
+             "url": url
+         })
+
+     return search_results
+
+ async def crawl_to_markdown(url: str) -> str:
+     """Crawl a URL and return its content as markdown."""
+     try:
+         browser_conf = BrowserConfig(headless=True, verbose=False)
+         filter_strategy = PruningContentFilter()
+         md_gen = DefaultMarkdownGenerator(content_filter=filter_strategy)
+         run_conf = CrawlerRunConfig(markdown_generator=md_gen)
+
+         async with AsyncWebCrawler(config=browser_conf) as crawler:
+             result = await crawler.arun(url=url, config=run_conf)
+             return result.markdown.fit_markdown or result.markdown.raw_markdown or ""
+     except Exception as e:
+         return f"Crawl error for {url}: {str(e)}"
+
+ async def generate_answer_with_crawling(question):
+     """Generate an answer using search results and crawled page content."""
+     try:
+         # 1. Get search results
+         search_results = search_serper(question)
+
+         # 2. Crawl each URL to get full content
+         crawled_content = []
+         for result in search_results:
+             url = result["url"]
+             title = result["title"]
+
+             print(f"Crawling: {title} ({url})")
+             markdown_content = await crawl_to_markdown(url)
+
+             # Limit content to avoid token limits
+             content_snippet = markdown_content[:2000] if markdown_content else result["snippet"]
+             crawled_content.append(f"## {title}\nSource: {url}\n\n{content_snippet}\n\n")
+
+         # 3. Combine all content for context
+         full_context = "\n".join(crawled_content)
+
+         messages = [
+             {"role": "system", "content": "You are a helpful assistant that answers questions using detailed web content. Provide citations with URLs when possible."},
+             {"role": "user", "content": f"Based on the following web content, answer the question. Include relevant citations.\n\nContent:\n{full_context}\n\nQuestion: {question}"}
+         ]
+
+         response = client.chat.completions.create(
+             model=DEPLOYMENT_NAME,
+             messages=messages,
+             temperature=0.8,
+             max_tokens=800
+         )
+         return response.choices[0].message.content, search_results
+
+     except Exception as e:
+         return f"Error: {str(e)}", []
+
+ def generate_answer(question):
+     """Quick mode: answer using search snippets only."""
+     search_results = search_serper(question)
+
+     snippets = []
+     for result in search_results:
+         title = result["title"]
+         snippet = result["snippet"]
+         url = result["url"]
+         snippets.append(f"{title}: {snippet} ({url})")
+
+     context = "\n".join(snippets)
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant that answers using real-time search context."},
+         {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}
+     ]
+     response = client.chat.completions.create(
+         model=DEPLOYMENT_NAME,
+         messages=messages,
+         temperature=0.8,
+         max_tokens=800
+     )
+     return response.choices[0].message.content
+
+ # API Endpoints
+ @app.get("/")
+ async def root():
+     return {"status": "ok"}
+
+
+ @app.post("/search")
+ async def search_endpoint(request: SearchRequest):
+     """Search endpoint that returns a JSON response."""
+     try:
+         print("\n🔍 Search Request:")
+         print(f"Question: {request.question}")
+         print(f"Mode: {request.mode}")
+
+         if request.mode == "deep":
+             print("🕷️ Starting deep search with web crawling...")
+             answer, sources = await generate_answer_with_crawling(request.question)
+         else:
+             print("⚡ Starting quick search...")
+             answer = generate_answer(request.question)
+             sources = search_serper(request.question)
+
+         response_data = {
+             "answer": answer,
+             "sources": sources,
+             "mode": request.mode,
+             "status": "success"
+         }
+
+         print("\n📋 Response Data:")
+         print(json.dumps(response_data, indent=2))
+
+         return response_data
+
+     except Exception as e:
+         error_response = {
+             "answer": f"Error: {str(e)}",
+             "sources": [],
+             "mode": request.mode,
+             "status": "error"
+         }
+
+         print("\n❌ Error Response:")
+         print(json.dumps(error_response, indent=2))
+
+         raise HTTPException(status_code=500, detail=error_response)
+
+ if __name__ == "__main__":
+     port = int(os.getenv("PORT", 5000))
+     print("🚀 Starting Search Assistant Server...")
+     print(f"📱 Port: {port}")
+     uvicorn.run(app, host="0.0.0.0", port=port)
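
Once the server is up, the /search endpoint can be exercised with a request like the one below; localhost:5000 assumes the default port, and the question string is purely illustrative:

    curl -s -X POST http://localhost:5000/search \
      -H "Content-Type: application/json" \
      -d '{"question": "What is Crawl4AI?", "mode": "deep"}'

Passing "mode": "quick" skips the crawling step and answers from search snippets alone.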
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ flask
+ flask-cors
+ crawl4ai
+ gunicorn
+ playwright
+ openai>=1.0.0
+ requests
+ python-dotenv
+ fastapi
+ uvicorn[standard]
+ pydantic
+ urllib3
+ certifi
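
For a local, non-Docker setup, the same steps the Dockerfile performs can be run by hand; this sketch assumes an activated Python virtual environment (the --with-deps flag may need root privileges to install system packages):

    pip install -r requirements.txt
    python -m playwright install --with-deps chromium
    crawl4ai-setup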
start.sh ADDED
@@ -0,0 +1,3 @@
+ #!/usr/bin/env bash
+ PORT=${PORT:-5000}
+ exec uvicorn app:app --host 0.0.0.0 --port "$PORT" --workers 1
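
Since the script falls back to 5000 only when PORT is unset, a local run on another port is simply:

    PORT=8000 ./start.sh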