Madras1 committed on
Commit
55cd806
·
verified ·
1 Parent(s): 676efb5

Upload 40 files

Browse files
app/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """Lancer - Advanced AI Search API"""
2
+
3
+ __version__ = "0.1.0"
app/agents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Agents module."""
app/agents/browser_agent.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browser Agent - Chrome with live stream and agent memory.
2
+
3
+ Uses E2B Desktop sandbox with Chrome browser.
4
+ Time limit: 5 minutes (300 seconds)
5
+ Shows live video stream.
6
+ Includes full memory/history tracking via AgentState.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import shlex
12
+ import logging
13
+ import base64
14
+ import time
15
+ from typing import AsyncGenerator, Optional
16
+
17
+ from app.config import get_settings
18
+ from app.agents.llm_client import generate_completion
19
+ from app.agents.graph.state import AgentState, NodeType
20
+ from app.agents.flaresolverr import is_cloudflare_blocked
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ MAX_TIME_SECONDS = 300 # 5 minutes
25
+
26
+
27
async def run_browser_agent(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Run the browser agent in an E2B desktop sandbox with a live stream.

    Drives Chrome inside an E2B Desktop sandbox, iterating an
    extract -> LLM-decide -> act loop until the LLM reports DONE or the
    time budget (MAX_TIME_SECONDS) is exhausted. All navigation history
    and extracted data are tracked in an AgentState "memory".

    Args:
        task: Natural-language task/question to research.
        url: Optional starting URL; when omitted, a DuckDuckGo search
            for ``task`` is used instead.

    Yields:
        Event dicts keyed by ``type``: ``status``, ``stream``,
        ``stream_end``, ``result``, ``complete`` or ``error``.
    """
    from urllib.parse import quote_plus  # local import: only needed here

    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    # Initialize agent state with memory.
    state = AgentState(
        task=task,
        url=url,
        timeout_seconds=MAX_TIME_SECONDS,
        start_time=time.time()
    )

    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None

    try:
        from e2b_desktop import Sandbox

        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating virtual desktop..."}
        desktop = Sandbox.create(timeout=600)
        state.desktop = desktop

        # Start the live video stream (best effort; agent works without it).
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            logger.info(f"Stream started: {stream_url}")
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Could not start stream: {e}")

        # Launch Chrome on the starting page.
        yield {"type": "status", "message": "🌐 Launching browser..."}

        if url:
            start_url = url
        else:
            # BUGFIX: percent-encode the whole query (was only space->'+',
            # so '&', '#', '?' in the task corrupted the URL).
            start_url = f"https://html.duckduckgo.com/html/?q={quote_plus(task)}"

        chrome_flags = "--no-sandbox --disable-gpu --start-maximized --no-first-run --disable-default-apps --disable-popup-blocking --disable-translate --no-default-browser-check"
        desktop.commands.run(f"google-chrome {chrome_flags} {shlex.quote(start_url)} &", background=True)
        desktop.wait(3000)

        # Dismiss any first-run dialog.
        desktop.press("enter")
        desktop.wait(1000)

        # Record the navigation in memory.
        state.visited_urls.append(start_url)
        state.add_action({"type": "navigate", "url": start_url})

        # Main loop: time-bounded, memory-driven decision cycle.
        while state.should_continue():
            state.step_count += 1
            elapsed = int(state.get_elapsed_time())
            remaining = int(state.get_remaining_time())

            yield {"type": "status", "message": f"🔍 Step {state.step_count}: Analyzing... ({elapsed}s / {MAX_TIME_SECONDS}s)"}

            # NOTE: the previous revision captured a screenshot and
            # base64-encoded it here, but the result was never used;
            # that dead work has been removed.

            # Fetch the current page's text via curl (strip scripts,
            # styles and tags, squeeze whitespace, cap at 6000 bytes).
            current_url = state.visited_urls[-1]
            page_content = ""

            try:
                result = desktop.commands.run(
                    f"curl -sL --max-time 10 --connect-timeout 5 "
                    f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0' "
                    f"{shlex.quote(current_url)} 2>/dev/null | "
                    "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
                    "sed 's/<[^>]*>//g' | "
                    "tr -s ' \\n' ' ' | "
                    "head -c 6000",
                    timeout=15
                )
                page_content = result.stdout.strip() if hasattr(result, 'stdout') else ""
                state.page_content = page_content
            except Exception as e:
                logger.warning(f"Content extraction failed: {e}")
                state.add_error(f"Content extraction failed: {e}")

            # Check for a Cloudflare challenge page.
            is_blocked = is_cloudflare_blocked(page_content) if page_content else False

            if is_blocked:
                yield {"type": "status", "message": f"🚫 Cloudflare at {current_url[:40]}..., trying next link..."}
                state.add_error(f"Cloudflare blocked: {current_url}")
            else:
                # Remember what we extracted from this page.
                state.extracted_data.append({
                    "url": current_url,
                    "content_length": len(page_content),
                    "preview": page_content[:200]
                })

            # Build the decision prompt with memory context.
            memory_context = state.get_context_for_llm()
            history_str = "\n".join([f"- {u}" for u in state.visited_urls[-5:]])
            content_preview = page_content[:2000] if page_content else "(empty page)"

            prompt = f"""You are a browser agent with memory. Analyze the page and decide the next action.

TASK: {task}
CURRENT URL: {current_url}
TIME REMAINING: {remaining}s
STEP: {state.step_count}

MEMORY:
{memory_context}

VISITED URLS:
{history_str}

PAGE CONTENT (blocked={is_blocked}):
{content_preview}

What should I do? Reply with JSON:
{{"action": "SEARCH|NAVIGATE|SCROLL|DONE", "value": "search query or URL", "reason": "brief reason"}}

- SEARCH: Search for something new (use if current results are insufficient)
- NAVIGATE: Go to a specific URL found on the page (MUST be different from visited URLs)
- SCROLL: Scroll down for more content
- DONE: Task is complete, provide final answer

RULES:
1. Do NOT navigate to already visited URLs
2. If blocked, navigate to a different link immediately
3. If you have enough info, respond with DONE
4. Include "answer" field when action is DONE"""

            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500
            )

            # Parse the LLM's JSON decision; on any parse failure, treat
            # the raw text as a final answer. BUGFIX: was a bare `except:`.
            try:
                json_match = response[response.find('{'):response.rfind('}')+1]
                decision = json.loads(json_match)
            except Exception:
                logger.warning(f"Could not parse LLM response: {response[:200]}")
                decision = {"action": "DONE", "answer": response}

            action = decision.get("action", "DONE")
            value = decision.get("value", "")
            reason = decision.get("reason", "")

            # Record the chosen action in memory.
            state.add_action({"type": action.lower(), "value": value, "reason": reason})

            yield {"type": "status", "message": f"🤔 Action: {action} - {reason[:50]}"}

            if action == "DONE":
                state.success = True
                final_answer = decision.get("answer", "")

                if not final_answer:
                    # No inline answer: synthesize one from memory.
                    all_content = "\n\n".join([
                        f"Source: {d['url']}\n{d.get('preview', '')}"
                        for d in state.extracted_data[-5:]
                    ])
                    final_prompt = f"Based on this content, answer: {task}\n\nContent:\n{all_content}"
                    final_answer = await generate_completion(
                        messages=[{"role": "user", "content": final_prompt}],
                        max_tokens=1000
                    )

                state.final_result = final_answer

                yield {"type": "stream_end", "message": "Done"}
                yield {
                    "type": "result",
                    "content": final_answer,
                    "links": state.visited_urls,
                    "steps": state.step_count,
                    "success": True
                }

                yield {"type": "complete", "message": f"Completed in {int(state.get_elapsed_time())}s with {state.step_count} steps"}
                return

            elif action == "SEARCH":
                # BUGFIX: percent-encode the query (was space->'+' only).
                new_url = f"https://html.duckduckgo.com/html/?q={quote_plus(value)}"

                if new_url not in state.visited_urls:
                    desktop.commands.run(f"google-chrome {shlex.quote(new_url)} &", background=True)
                    desktop.wait(3000)
                    state.visited_urls.append(new_url)

            elif action == "NAVIGATE":
                if value and value.startswith("http"):
                    if value in state.visited_urls:
                        yield {"type": "status", "message": f"⏭️ Already visited, skipping..."}
                        state.add_error(f"Tried to revisit: {value}")
                    else:
                        desktop.commands.run(f"google-chrome {shlex.quote(value)} &", background=True)
                        desktop.wait(3000)
                        state.visited_urls.append(value)

            elif action == "SCROLL":
                desktop.press("pagedown")
                desktop.wait(1500)

            # Small delay between steps.
            desktop.wait(1000)

        # Time budget exhausted: synthesize a final answer from memory.
        yield {"type": "status", "message": "⏰ Time limit reached, generating final answer from memory..."}

        all_content = "\n\n".join([
            f"Source: {d['url']}\n{d.get('preview', '')}"
            for d in state.extracted_data[-5:]
        ])
        final_prompt = f"Based on this content, answer: {task}\n\nContent:\n{all_content}"
        final_answer = await generate_completion(
            messages=[{"role": "user", "content": final_prompt}],
            max_tokens=1000
        )

        state.final_result = final_answer

        yield {"type": "stream_end", "message": "Done"}
        yield {
            "type": "result",
            "content": final_answer,
            "links": state.visited_urls,
            "steps": state.step_count,
            "success": True
        }
        yield {"type": "complete", "message": f"Completed in {MAX_TIME_SECONDS}s (timeout) with {state.step_count} steps"}

    except ImportError:
        # BUGFIX: was `except ImportError as e` with `e` unused.
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Browser agent error")
        yield {"type": "error", "message": f"Error: {str(e)}"}
    finally:
        # Best-effort cleanup; the sandbox bills while it stays alive.
        # BUGFIX: bare `except:` clauses narrowed to `except Exception:`.
        if desktop:
            try:
                desktop.stream.stop()
            except Exception:
                pass
            try:
                desktop.kill()
            except Exception:
                pass
app/agents/browser_agent_v2.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browser Agent v2 - Uses Camoufox stealth browser inside E2B.
2
+
3
+ Camoufox = Firefox stealth que passa anti-bot.
4
+ Roda DENTRO do E2B sandbox.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import logging
10
+ import shlex
11
+ import time
12
+ from typing import AsyncGenerator, Optional
13
+
14
+ from app.config import get_settings
15
+ from app.agents.llm_client import generate_completion
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
async def run_browser_agent_v2(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Run the v2 browser agent: Camoufox stealth browser inside E2B.

    Installs Camoufox (a stealth Firefox build that passes many anti-bot
    checks) into a fresh E2B desktop sandbox, runs a generated scraper
    script there (see _build_script), then synthesizes a single LLM
    answer from the scraped content.

    Args:
        task: Natural-language question/task to research.
        url: Optional starting URL; defaults to a DuckDuckGo search.

    Yields:
        Event dicts keyed by ``type``: ``status``, ``stream``,
        ``stream_end``, ``result``, ``complete`` or ``error``.
    """
    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None
    start_time = time.time()

    try:
        from e2b_desktop import Sandbox

        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating sandbox..."}
        desktop = Sandbox.create(timeout=900)

        # Live stream is best effort - the agent still works without it.
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Stream failed: {e}")

        # Install Camoufox + Playwright inside the sandbox.
        yield {"type": "status", "message": "📦 Installing stealth browser (pip)..."}

        try:
            desktop.commands.run("pip install --user camoufox playwright -q", timeout=120)

            yield {"type": "status", "message": "🔽 Downloading Firefox stealth (~30s)..."}
            desktop.commands.run("camoufox fetch", timeout=180)

            yield {"type": "status", "message": "🔧 Installing browser dependencies..."}
            # GTK/audio libs Firefox needs; failures are tolerated (|| true).
            desktop.commands.run("sudo apt-get update -qq && sudo apt-get install -y -qq libgtk-3-0 libasound2 libdbus-glib-1-2 2>/dev/null || true", timeout=60)

            yield {"type": "status", "message": "✅ Browser ready!"}
        except Exception as e:
            logger.error(f"Camoufox install failed: {e}")
            yield {"type": "error", "message": f"Install failed: {e}"}
            return

        # Generate the scraper script and copy it into the sandbox.
        yield {"type": "status", "message": f"🔍 Searching: {task[:40]}..."}

        script = _build_script(task, url)

        # Quoted 'EOF' heredoc: the script body is written verbatim,
        # with no shell expansion inside it.
        desktop.commands.run(
            f"cat > /tmp/scrape.py << 'EOF'\n{script}\nEOF",
            timeout=10
        )

        yield {"type": "status", "message": "🌐 Navigating with stealth browser..."}

        # Run the scraper; it prints a single JSON object on stdout.
        result = desktop.commands.run("python3 /tmp/scrape.py", timeout=240)
        output = result.stdout.strip() if hasattr(result, 'stdout') else ""

        if not output:
            yield {"type": "error", "message": "No output from scraper"}
            return

        # Parse the scraper's JSON; fall back to raw text on bad JSON.
        try:
            data = json.loads(output)
            content = data.get("content", "")
            urls = data.get("urls", [])
            error = data.get("error")

            if error:
                yield {"type": "error", "message": error}
                return
        except json.JSONDecodeError:
            content = output[:4000]
            urls = []

        # Synthesize the final answer with a single LLM call.
        yield {"type": "status", "message": "✨ Generating response..."}

        prompt = f"""Analise e responda:

PERGUNTA: {task}

CONTEÚDO:
{content[:5000]}

Use **negrito** para valores importantes. Seja direto."""

        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1200
        )

        final = response.strip() if response else content[:1000]

        # Done - close the stream and emit the result.
        yield {"type": "stream_end", "message": "Done"}

        yield {
            "type": "result",
            "content": final,
            "links": urls[:10],
            "success": True
        }

        elapsed = int(time.time() - start_time)
        yield {"type": "complete", "message": f"Done in {elapsed}s"}

    except ImportError:
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Agent error")
        yield {"type": "error", "message": str(e)}
    finally:
        # Best-effort cleanup; the sandbox bills while it stays alive.
        # BUGFIX: bare `except:` clauses narrowed to `except Exception:`.
        if desktop:
            try:
                desktop.stream.stop()
            except Exception:
                pass
            try:
                desktop.kill()
            except Exception:
                pass
156
+
157
+
158
+ def _build_script(task: str, url: Optional[str] = None) -> str:
159
+ """Build Python script to run inside E2B with Camoufox."""
160
+
161
+ task_safe = task.replace("'", "\\'").replace('"', '\\"')
162
+ search_url = url or f"https://html.duckduckgo.com/html/?q={task.replace(' ', '+')}"
163
+
164
+ return f'''
165
+ import json
166
+ import sys
167
+
168
+ try:
169
+ from camoufox.sync_api import Camoufox
170
+ except:
171
+ print(json.dumps({{"error": "Camoufox not found"}}))
172
+ sys.exit(1)
173
+
174
+ urls = []
175
+ contents = []
176
+
177
+ def extract(page):
178
+ try:
179
+ return page.evaluate("""() => {{
180
+ document.querySelectorAll('script,style,noscript').forEach(e => e.remove());
181
+ return document.body.innerText || '';
182
+ }}""")[:4000]
183
+ except:
184
+ return ""
185
+
186
+ def is_blocked(text):
187
+ t = text.lower()
188
+ if len(text) < 500:
189
+ blocks = ["checking your browser", "cloudflare", "access denied", "blocked"]
190
+ return any(b in t for b in blocks)
191
+ return False
192
+
193
+ try:
194
+ with Camoufox(headless=True) as browser:
195
+ page = browser.new_page()
196
+
197
+ # Search
198
+ page.goto("{search_url}", timeout=30000)
199
+ page.wait_for_timeout(2000)
200
+ urls.append("{search_url}")
201
+
202
+ content = extract(page)
203
+ if not is_blocked(content):
204
+ contents.append(content)
205
+
206
+ # Get links
207
+ links = page.evaluate("""() => {{
208
+ return Array.from(document.querySelectorAll('a[href^="http"]'))
209
+ .map(a => a.href)
210
+ .filter(h => !h.includes('duckduckgo') && !h.includes('google'))
211
+ .slice(0, 5);
212
+ }}""")
213
+
214
+ # Visit up to 2 links
215
+ for link in links[:2]:
216
+ if link in urls:
217
+ continue
218
+ try:
219
+ page.goto(link, timeout=20000)
220
+ page.wait_for_timeout(1500)
221
+ urls.append(link)
222
+
223
+ c = extract(page)
224
+ if not is_blocked(c):
225
+ contents.append(c)
226
+ except:
227
+ pass
228
+
229
+ result = "\\n\\n---\\n\\n".join(contents)
230
+ print(json.dumps({{"content": result[:8000], "urls": urls}}))
231
+
232
+ except Exception as e:
233
+ print(json.dumps({{"error": str(e)}}))
234
+ '''
app/agents/browser_agent_v3.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browser Agent v3 - Batch extraction, minimal LLM calls.
2
+
3
+ Flow:
4
+ 1. Search DuckDuckGo → Get top links
5
+ 2. Batch extract content from 3-5 pages (NO LLM calls)
6
+ 3. Send ALL content to LLM in ONE call
7
+ 4. LLM either responds OR requests specific follow-up
8
+
9
+ Target: 2-4 LLM calls max instead of 40+
10
+ """
11
+
12
+ import os
13
+ import re
14
+ import shlex
15
+ import logging
16
+ import time
17
+ from typing import AsyncGenerator, Optional, List, Dict
18
+
19
+ from app.config import get_settings
20
+ from app.agents.llm_client import generate_completion
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Config
25
+ MAX_PAGES_TO_EXTRACT = 4
26
+ TIMEOUT_SECONDS = 300
27
+ CONTENT_PER_PAGE = 2000
28
+
29
+
30
async def run_browser_agent_v3(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Run the v3 browser agent: batch extraction, minimal LLM calls.

    Flow: search DuckDuckGo, batch-extract text from the top result
    pages with curl (no LLM involved), then answer everything in a
    single LLM synthesis call.

    Args:
        task: Natural-language question/task to research.
        url: Accepted for interface parity with v1/v2 but currently
            unused - the flow always starts from a DuckDuckGo search.
            TODO confirm whether a direct-URL mode is wanted here.

    Yields:
        Event dicts keyed by ``type``: ``status``, ``stream``,
        ``stream_end``, ``result``, ``complete`` or ``error``.
    """
    from urllib.parse import quote_plus  # local import: only needed here

    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    start_time = time.time()
    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None

    try:
        from e2b_desktop import Sandbox

        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating virtual desktop..."}
        desktop = Sandbox.create(timeout=600)

        # Live stream is best effort - the agent still works without it.
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Could not start stream: {e}")

        # Launch Chrome on a blank page; navigation happens per URL below.
        yield {"type": "status", "message": "🌐 Launching browser..."}
        chrome_flags = "--no-sandbox --disable-gpu --start-maximized --no-first-run --disable-default-apps --disable-popup-blocking --disable-translate --no-default-browser-check"
        desktop.commands.run(f"google-chrome {chrome_flags} 'about:blank' &", background=True)
        desktop.wait(3000)
        desktop.press("enter")  # dismiss any first-run dialog
        desktop.wait(1000)

        # Phase 1: search DuckDuckGo for the task.
        yield {"type": "status", "message": f"🔍 Searching: {task[:50]}..."}
        # BUGFIX: percent-encode the whole query (was only space->'+',
        # so '&', '#', '?' in the task corrupted the URL).
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(task)}"

        desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
        desktop.wait(3000)

        # Extract the search results page and parse candidate links.
        search_content = await _extract_page_content(desktop, search_url)

        links = _extract_links_from_search(search_content, task)
        logger.info(f"Found {len(links)} relevant links")

        if not links:
            # Fallback: answer from the search page itself.
            links = [search_url]

        # Phase 2: batch-extract text from the top pages (no LLM calls).
        extracted_pages: List[Dict] = []

        for i, link in enumerate(links[:MAX_PAGES_TO_EXTRACT]):
            remaining = int(TIMEOUT_SECONDS - (time.time() - start_time))
            if remaining < 30:
                # Keep at least ~30s in the budget for the synthesis call.
                break

            yield {"type": "status", "message": f"📊 Extracting page {i+1}/{min(len(links), MAX_PAGES_TO_EXTRACT)}... ({remaining}s remaining)"}

            try:
                desktop.commands.run(f"google-chrome {shlex.quote(link)} &", background=True)
                desktop.wait(2500)

                content = await _extract_page_content(desktop, link)
                # Skip near-empty pages (error stubs, blocks).
                if content and len(content) > 100:
                    extracted_pages.append({
                        "url": link,
                        "content": content[:CONTENT_PER_PAGE]
                    })
                    logger.info(f"Extracted {len(content)} chars from {link[:50]}")
            except Exception as e:
                logger.warning(f"Failed to extract {link}: {e}")

        # Phase 3: ONE LLM call with all gathered content.
        yield {"type": "status", "message": "🤔 Analyzing all sources..."}

        pages_context = "\n\n---\n\n".join([
            f"SOURCE {i+1}: {p['url']}\n{p['content']}"
            for i, p in enumerate(extracted_pages)
        ])

        prompt = f"""Você é um assistente de pesquisa. Analise as fontes abaixo e responda à pergunta.

PERGUNTA: {task}

FONTES COLETADAS:
{pages_context if pages_context else "(Nenhum conteúdo extraído)"}

INSTRUÇÕES:
1. Responda baseado APENAS nas fontes acima
2. Use **negrito** para valores importantes (preços, números, nomes)
3. Cite as fontes quando possível (ex: "Segundo o site X...")
4. Se as fontes não respondem a pergunta, diga isso honestamente
5. Seja direto e organizado

Responda em português:"""

        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1500
        )

        final_result = response.strip() if response else "Não foi possível gerar resposta."

        # Emit the final result.
        yield {"type": "stream_end", "message": "Stream ended"}

        yield {
            "type": "result",
            "content": final_result,
            "links": [p["url"] for p in extracted_pages],
            "success": True
        }

        elapsed = int(time.time() - start_time)
        yield {"type": "complete", "message": f"Completed in {elapsed}s with {len(extracted_pages)} sources"}

        logger.info(f"Agent complete. Sources: {len(extracted_pages)}, Time: {elapsed}s, LLM calls: 1")

    except ImportError:
        # BUGFIX: was `except ImportError as e` with `e` unused.
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Browser agent error")
        yield {"type": "error", "message": f"Error: {str(e)}"}
    finally:
        # Best-effort cleanup; the sandbox bills while it stays alive.
        if desktop:
            try:
                desktop.stream.stop()
            except Exception:
                pass
            try:
                desktop.kill()
            except Exception:
                pass
+
179
+
180
+ async def _extract_page_content(desktop, url: str) -> str:
181
+ """Extract text content from a page using curl."""
182
+ try:
183
+ result = desktop.commands.run(
184
+ f"curl -sL --max-time 8 --connect-timeout 5 "
185
+ f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
186
+ f"{shlex.quote(url)} 2>/dev/null | "
187
+ "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
188
+ "sed 's/<[^>]*>//g' | "
189
+ "tr -s ' \\n' ' ' | "
190
+ "head -c 8000",
191
+ timeout=12
192
+ )
193
+ return result.stdout.strip() if hasattr(result, 'stdout') else ""
194
+ except Exception as e:
195
+ logger.warning(f"Extract failed for {url}: {e}")
196
+ return ""
197
+
198
+
199
+ def _extract_links_from_search(content: str, task: str) -> List[str]:
200
+ """Extract relevant links from DuckDuckGo search results."""
201
+ # DuckDuckGo HTML links pattern
202
+ links = []
203
+
204
+ # Find URLs in the content
205
+ url_pattern = r'https?://[^\s<>"\']+[a-zA-Z0-9/]'
206
+ found_urls = re.findall(url_pattern, content)
207
+
208
+ # Filter out search engine URLs and duplicates
209
+ seen = set()
210
+ for url in found_urls:
211
+ # Clean URL
212
+ url = url.rstrip('.,;:)')
213
+
214
+ # Skip search engines, trackers, etc
215
+ skip_domains = ['duckduckgo.com', 'google.com', 'bing.com', 'facebook.com', 'twitter.com', 'instagram.com']
216
+ if any(d in url.lower() for d in skip_domains):
217
+ continue
218
+
219
+ # Skip if already seen
220
+ domain = url.split('/')[2] if len(url.split('/')) > 2 else url
221
+ if domain in seen:
222
+ continue
223
+ seen.add(domain)
224
+
225
+ links.append(url)
226
+
227
+ if len(links) >= 8:
228
+ break
229
+
230
+ return links
app/agents/deep_research.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deep Research Orchestrator.
2
+
3
+ Coordinates the full deep research pipeline:
4
+ 1. Planning (query decomposition)
5
+ 2. Parallel searching (multiple dimensions)
6
+ 3. Report synthesis
7
+ """
8
+
9
+ import asyncio
10
+ import json
11
+ import time
12
+ from typing import AsyncIterator, Optional
13
+
14
+ from app.agents.planner import create_research_plan, ResearchPlan, ResearchDimension
15
+ from app.agents.llm_client import generate_completion_stream
16
+ from app.reranking.pipeline import rerank_results
17
+ from app.config import get_settings
18
+
19
+
20
class DimensionResult:
    """Container for the outcome of researching a single dimension.

    Attributes:
        dimension: The ResearchDimension that was searched.
        results: Ranked search-result dicts (empty until populated).
        error: Error message when the search failed, otherwise None.
    """

    def __init__(self, dimension: ResearchDimension):
        self.dimension = dimension
        # Holds str(exception) when the dimension search raised.
        self.error: Optional[str] = None
        # Populated by the search step; stays empty on failure.
        self.results: list[dict] = []
27
+
28
+
29
async def run_deep_research(
    query: str,
    max_dimensions: int = 6,
    max_sources_per_dim: int = 5,
    max_total_searches: int = 20,
) -> AsyncIterator[str]:
    """
    Run the deep research pipeline with streaming progress.

    Yields SSE-formatted events as the research progresses:
    planning -> per-dimension search -> streamed report synthesis.

    Note: dimensions are searched sequentially, one at a time, with a
    short pause between them to stay under upstream rate limits (the
    previous comments claimed parallelism that was never implemented).

    Args:
        query: The research query
        max_dimensions: Maximum dimensions to research
        max_sources_per_dim: Max results per dimension
        max_total_searches: Total search API calls allowed

    Yields:
        SSE event strings in format: data: {json}\n\n
    """
    start_time = time.perf_counter()
    # BUGFIX: removed unused local `settings = get_settings()`.

    try:
        # === PHASE 1: PLANNING (query decomposition) ===
        yield _sse_event("status", {"phase": "planning", "message": "Analyzing query..."})

        plan = await create_research_plan(query, max_dimensions)

        yield _sse_event("plan_ready", {
            "refined_query": plan.refined_query,
            "dimensions": [
                {"name": d.name, "description": d.description, "priority": d.priority}
                for d in plan.dimensions
            ],
            "estimated_sources": plan.estimated_sources,
        })

        # === PHASE 2: SEARCHING (sequential, one dimension at a time) ===
        yield _sse_event("status", {"phase": "searching", "message": "Researching dimensions..."})

        # Distribute the search budget evenly across dimensions.
        # BUGFIX: guard against an empty plan (was ZeroDivisionError).
        num_dimensions = len(plan.dimensions)
        if num_dimensions:
            searches_per_dim = max(1, max_total_searches // num_dimensions)
        else:
            searches_per_dim = max_total_searches

        dimension_results: list[DimensionResult] = []

        for i, dimension in enumerate(plan.dimensions):
            yield _sse_event("dimension_start", {
                "index": i + 1,
                "total": num_dimensions,
                "name": dimension.name,
                "query": dimension.search_query,
            })

            # Search this dimension.
            result = await _search_dimension(
                dimension=dimension,
                max_results=max_sources_per_dim,
                max_searches=searches_per_dim,
            )
            dimension_results.append(result)

            yield _sse_event("dimension_complete", {
                "index": i + 1,
                "name": dimension.name,
                "results_count": len(result.results),
                "error": result.error,
            })

            # Small delay to avoid rate limits.
            await asyncio.sleep(0.1)

        # === PHASE 3: SYNTHESIS ===
        yield _sse_event("status", {"phase": "synthesizing", "message": "Generating report..."})
        yield _sse_event("synthesis_start", {})

        # Stream the report generation chunk by chunk.
        async for chunk in _synthesize_report_stream(query, plan, dimension_results):
            yield _sse_event("report_chunk", {"content": chunk})

        # === COMPLETE ===
        total_time = time.perf_counter() - start_time
        total_sources = sum(len(r.results) for r in dimension_results)

        yield _sse_event("done", {
            "total_sources": total_sources,
            "total_dimensions": num_dimensions,
            "total_time_seconds": round(total_time, 2),
        })

    except Exception as e:
        yield _sse_event("error", {"message": str(e)})
123
+
124
+
125
async def _search_dimension(
    dimension: ResearchDimension,
    max_results: int = 5,
    max_searches: int = 2,
) -> DimensionResult:
    """Search one research dimension through the source aggregator.

    Any exception is captured into the returned DimensionResult's
    ``error`` field rather than propagated.

    Note: ``max_searches`` is accepted for budget bookkeeping but is
    not consumed here - presumably reserved for future multi-query
    search; verify against callers.
    """
    from app.sources.aggregator import aggregate_search

    outcome = DimensionResult(dimension)

    try:
        # Over-fetch a few extra results so the reranker has candidates
        # it can discard.
        raw_hits = await aggregate_search(
            query=dimension.search_query,
            max_results=max_results + 3,
            include_wikipedia=True,
        )

        if raw_hits:
            # Embedding-based rerank only pays off with a larger pool
            # (e.g. when SearXNG returns many results).
            outcome.results = await rerank_results(
                query=dimension.search_query,
                results=raw_hits,
                temporal_urgency=0.5,
                max_results=max_results,
                use_embeddings=len(raw_hits) > 15,
            )

    except Exception as exc:
        outcome.error = str(exc)

    return outcome
159
+
160
+
161
async def _synthesize_report_stream(
    original_query: str,
    plan: ResearchPlan,
    dimension_results: list[DimensionResult],
) -> AsyncIterator[str]:
    """Stream the final report: LLM-generated body, then a sources list.

    Flattens every dimension's results into one numbered context block,
    asks the LLM to write a Markdown report citing those numbers, and
    finishes with a "## Sources" section linking each citation.
    """
    # Flatten all dimension results into a numbered context.
    chunks = []
    citations = []
    idx = 1

    for dim_result in dimension_results:
        if not dim_result.results:
            continue
        chunks.append(f"\n## {dim_result.dimension.name}\n")
        for hit in dim_result.results:
            chunks.append(
                f"[{idx}] {hit.get('title', 'Untitled')}\n"
                f" URL: {hit.get('url', '')}\n"
                f" Content: {hit.get('content', '')[:400]}...\n"
            )
            citations.append({
                "index": idx,
                "title": hit.get("title", ""),
                "url": hit.get("url", ""),
            })
            idx += 1

    context = "\n".join(chunks)

    # Synthesis prompt for the report writer.
    prompt = f"""You are a research analyst. Create a comprehensive research report based on the gathered information.

ORIGINAL QUERY: {original_query}
REFINED QUERY: {plan.refined_query}

RESEARCH DIMENSIONS:
{', '.join(d.name for d in plan.dimensions)}

GATHERED INFORMATION:
{context}

INSTRUCTIONS:
1. Write a comprehensive research report in Markdown format
2. Start with an Executive Summary (2-3 paragraphs)
3. Create a section for each research dimension
4. Use citations [1], [2], etc. to reference sources
5. Include a Conclusion section
6. Be thorough but concise
7. Write in the same language as the query
8. Use headers (##) to organize sections

Generate the report:"""

    messages = [
        {"role": "system", "content": "You are a research analyst creating detailed reports."},
        {"role": "user", "content": prompt},
    ]

    try:
        # Stream the LLM's report body straight through.
        async for chunk in generate_completion_stream(messages, temperature=0.4):
            yield chunk

        # Append the numbered source links at the end.
        yield "\n\n---\n\n## Sources\n\n"
        for src in citations:
            yield f"[{src['index']}] [{src['title']}]({src['url']})\n"

    except Exception as e:
        yield f"\n\n**Error generating report:** {e}"
231
+
232
+
233
+ def _sse_event(event_type: str, data: dict) -> str:
234
+ """Format an SSE event."""
235
+ payload = {"type": event_type, **data}
236
+ return f"data: {json.dumps(payload)}\n\n"
app/agents/flaresolverr.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FlareSolverr client for Cloudflare bypass.
2
+
3
+ FlareSolverr uses undetected-chromedriver to solve Cloudflare challenges.
4
+ Must be running at http://localhost:8191 in the E2B sandbox.
5
+ """
6
+
7
+ import logging
8
+ import json
9
+ import shlex
10
+ from typing import Optional, Tuple
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ FLARESOLVERR_URL = "http://localhost:8191/v1"
15
+
16
+
17
async def solve_cloudflare(desktop, url: str, timeout: int = 60) -> Tuple[bool, str]:
    """
    Use FlareSolverr to bypass Cloudflare protection.

    Args:
        desktop: E2B desktop instance
        url: URL to fetch through FlareSolverr
        timeout: Max seconds to wait for solution

    Returns:
        (success: bool, content: str) - content is tag-stripped plain text,
        truncated to roughly 6000 characters; ("", False) on any failure.
    """
    try:
        # Make request to FlareSolverr - properly escape the JSON payload.
        # maxTimeout is in milliseconds per the FlareSolverr API.
        payload = json.dumps({
            "cmd": "request.get",
            "url": url,
            "maxTimeout": timeout * 1000
        })

        # curl runs inside the sandbox; shlex.quote guards both the endpoint
        # and the JSON body against shell interpretation. The extra 10s gives
        # curl headroom beyond FlareSolverr's own timeout.
        result = desktop.commands.run(
            f"curl -s -X POST {shlex.quote(FLARESOLVERR_URL)} "
            f"-H 'Content-Type: application/json' "
            f"-d {shlex.quote(payload)} 2>/dev/null",
            timeout=timeout + 10
        )

        # No stdout means curl failed or FlareSolverr is not running.
        if not hasattr(result, 'stdout') or not result.stdout:
            return False, ""

        response = json.loads(result.stdout)

        if response.get("status") == "ok":
            solution = response.get("solution", {})
            html = solution.get("response", "")

            # Strip HTML tags - use base64 to safely pass content through the
            # shell (the raw HTML could contain quotes and metacharacters).
            if html:
                import base64
                html_b64 = base64.b64encode(html[:10000].encode()).decode()
                clean_result = desktop.commands.run(
                    f"echo {shlex.quote(html_b64)} | base64 -d | sed 's/<[^>]*>//g' | tr -s ' \\n' ' ' | head -c 6000",
                    timeout=5
                )
                # Fall back to raw (truncated) HTML if the cleanup command
                # returned nothing usable.
                content = clean_result.stdout.strip() if hasattr(clean_result, 'stdout') else html[:6000]
                logger.info(f"FlareSolverr solved: {url[:50]}")
                return True, content

        logger.warning(f"FlareSolverr failed: {response.get('message', 'unknown')}")
        return False, ""

    except Exception as e:
        # Best-effort helper: never propagate - callers treat (False, "")
        # as "bypass unavailable" and continue without it.
        logger.warning(f"FlareSolverr error: {e}")
        return False, ""
71
+
72
+
73
def is_cloudflare_blocked(content: str) -> bool:
    """Heuristically decide whether *content* is a Cloudflare challenge page.

    A page counts as blocked when it is very short (< 500 chars, typical of
    interstitial challenge pages) AND carries one of the strong challenge
    phrases, or when it contains a phrase that only ever appears on a
    challenge page. Long pages that merely mention Cloudflare are not
    flagged.
    """
    lowered = content.lower()

    # Phrases common on challenge pages, but which could in principle also
    # appear inside a real article about Cloudflare.
    strong_markers = (
        "checking your browser before accessing",
        "please wait while we verify",
        "ray id:",
        "cloudflare ray id",
        "enable javascript and cookies",
        "attention required! | cloudflare",
        "just a moment...",
        "ddos protection by cloudflare",
    )

    # Challenge interstitials are tiny; genuine content pages that mention
    # Cloudflare are almost always longer than this threshold.
    if len(content) < 500 and any(marker in lowered for marker in strong_markers):
        return True

    # Phrases that are unambiguous challenge-page text regardless of length.
    definite_markers = (
        "checking if the site connection is secure",
        "please turn javascript on and reload the page",
        "please enable cookies",
    )
    return any(marker in lowered for marker in definite_markers)
112
+
113
+
114
def is_login_wall(content: str) -> bool:
    """Return True when the page looks like a login/registration wall.

    A single mention (e.g. a "Sign in" link in a navbar) is not enough;
    at least two distinct login-related phrases must be present.
    """
    markers = (
        "sign in",
        "log in",
        "login",
        "create account",
        "register",
        "enter your password",
        "authentication required",
    )
    lowered = content.lower()
    hits = sum(1 for marker in markers if marker in lowered)
    return hits >= 2
app/agents/graph/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Agent Graph Package
app/agents/graph/nodes.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Graph nodes for the agent execution.
2
+
3
+ Each node represents a step in the agent's decision process:
4
+ - PlanNode: Decomposes the task into subtasks
5
+ - SearchNode: Performs web searches
6
+ - NavigateNode: Navigates to URLs
7
+ - ExtractNode: Extracts content from pages
8
+ - VerifyNode: Verifies if goal is achieved
9
+ - RespondNode: Generates final response
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import shlex
15
+ import base64
16
+ from abc import ABC, abstractmethod
17
+ from typing import Tuple
18
+
19
+ from app.agents.graph.state import AgentState, NodeType
20
+ from app.agents.llm_client import generate_completion
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class BaseNode(ABC):
    """Abstract parent of every node in the agent graph.

    Concrete subclasses override ``node_type`` with their own identity and
    implement :meth:`execute`, which advances the shared state and names
    the node to run next.
    """

    # Overridden by each concrete node; START is a placeholder default.
    node_type: NodeType = NodeType.START

    @abstractmethod
    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        """Run this node's logic; return (updated state, next node type)."""
        ...
34
+
35
+
36
class PlanNode(BaseNode):
    """Decomposes the task into an ordered plan of simple steps.

    Asks the LLM for a JSON plan (goal, steps, success criteria). On any
    failure it installs a one-step search plan so execution can always
    proceed.
    """

    node_type = NodeType.PLAN

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        # The prompt is intentionally in Portuguese (agent's user-facing
        # language); do not translate - it is runtime text sent to the LLM.
        prompt = f"""Você é um planejador de tarefas. Decomponha a tarefa em passos simples.

TAREFA: {state.task}
URL inicial: {state.url or 'Nenhuma - começar com busca'}

Responda com JSON:
{{
    "goal": "objetivo principal",
    "steps": [
        {{"action": "search", "query": "termos de busca"}},
        {{"action": "navigate", "description": "onde navegar"}},
        {{"action": "extract", "what": "o que extrair"}}
    ],
    "success_criteria": "critério de sucesso"
}}

Responda APENAS o JSON, sem explicação."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500
            )

            # Parse JSON - strip a possible ```/```json code fence around
            # the model's payload before decoding.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
            if response.startswith("json"):
                response = response[4:]

            plan = json.loads(response)
            state.plan = plan
            logger.info(f"Plan created: {plan.get('goal', 'No goal')}")

            # Decide next node based on plan: honor the caller-supplied URL
            # only when the plan itself starts with a navigation step.
            if plan.get("steps") and plan["steps"][0].get("action") == "navigate" and state.url:
                return state, NodeType.NAVIGATE
            return state, NodeType.SEARCH

        except Exception as e:
            logger.error(f"Planning failed: {e}")
            state.add_error(f"Planning failed: {e}")
            # Fallback to search: a minimal plan using the raw task as query.
            state.plan = {"goal": state.task, "steps": [{"action": "search", "query": state.task}]}
            return state, NodeType.SEARCH
88
+
89
+
90
class SearchNode(BaseNode):
    """Performs a web search via DuckDuckGo's HTML endpoint.

    Picks the first "search" step from the current plan (falling back to
    the raw task text), opens the results page in the sandboxed Chrome,
    records the action, and hands off to EXTRACT. On failure it routes to
    VERIFY so the agent can decide how to recover.
    """

    node_type = NodeType.SEARCH

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        from urllib.parse import quote_plus  # stdlib; local import keeps module imports unchanged

        desktop = state.desktop

        # Determine the search query: prefer an explicit "search" step from
        # the plan, otherwise use the task itself.
        query = state.task
        if state.plan.get("steps"):
            for step in state.plan["steps"]:
                if step.get("action") == "search" and step.get("query"):
                    query = step["query"]
                    break

        # Percent-encode the query. quote_plus handles spaces, '&', '#',
        # '?', non-ASCII, etc.; the previous bare replace(' ', '+') built
        # broken URLs for any query containing reserved characters.
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"

        try:
            # Open the results page; shlex.quote protects the URL inside
            # the shell command line.
            desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
            state.visited_urls.append(search_url)
            desktop.wait(3000)  # give the page time to load

            state.add_action({"type": "search", "query": query})
            logger.info(f"Searched: {query}")

            return state, NodeType.EXTRACT

        except Exception as e:
            state.add_error(f"Search failed: {e}")
            return state, NodeType.VERIFY
122
+
123
+
124
class NavigateNode(BaseNode):
    """Opens a target URL in the sandboxed browser.

    Uses the URL requested on the state, falling back to the first link
    captured by the most recent extraction. With no URL at all it defers
    to SEARCH; on success it hands off to EXTRACT.
    """

    node_type = NodeType.NAVIGATE

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        desktop = state.desktop

        # Prefer the explicitly requested URL; otherwise try the first link
        # recorded by the latest extraction result.
        target = state.url
        if not target and state.extracted_data:
            candidate_links = state.extracted_data[-1].get("data", {}).get("links")
            if candidate_links:
                target = candidate_links[0]

        if not target:
            # Nothing to open - go search instead.
            return state, NodeType.SEARCH

        try:
            desktop.commands.run(f"google-chrome {shlex.quote(target)} &", background=True)
            if target not in state.visited_urls:
                state.visited_urls.append(target)
            desktop.wait(3000)  # allow the page to render

            state.add_action({"type": "navigate", "url": target})
            logger.info(f"Navigated to: {target[:50]}")

            return state, NodeType.EXTRACT

        except Exception as e:
            state.add_error(f"Navigation failed: {e}")
            return state, NodeType.SEARCH
159
+
160
+
161
class ExtractNode(BaseNode):
    """Extracts readable text from the most recently visited page.

    Records the active window title (best effort), fetches the page over
    curl, strips scripts/styles/tags down to ~6000 chars of plain text,
    and always routes to VERIFY - even on failure - so the LLM can decide
    how to recover.
    """

    node_type = NodeType.EXTRACT

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        desktop = state.desktop
        current_url = state.visited_urls[-1] if state.visited_urls else ""

        try:
            # Get window title (empty string when xdotool is unavailable).
            result = desktop.commands.run("xdotool getactivewindow getwindowname 2>/dev/null", timeout=5)
            state.window_title = result.stdout.strip() if hasattr(result, 'stdout') else ""

            # Extract page content via curl. The URL originates from search
            # results / the LLM and is untrusted: shlex.quote prevents shell
            # injection (the previous bare '{current_url}' interpolation
            # broke on quotes and allowed command injection).
            if current_url.startswith("http"):
                result = desktop.commands.run(
                    f"curl -sL --max-time 10 --connect-timeout 5 "
                    f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
                    f"{shlex.quote(current_url)} 2>/dev/null | "
                    "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
                    "sed 's/<[^>]*>//g' | "
                    "tr -s ' \\n' ' ' | "
                    "head -c 6000",
                    timeout=15
                )
                state.page_content = result.stdout.strip() if hasattr(result, 'stdout') else ""

            state.add_action({"type": "extract", "content_length": len(state.page_content)})
            logger.info(f"Extracted {len(state.page_content)} chars from {current_url[:50]}")

            return state, NodeType.VERIFY

        except Exception as e:
            state.add_error(f"Extraction failed: {e}")
            return state, NodeType.VERIFY
197
+
198
+
199
class VerifyNode(BaseNode):
    """Checks progress against the plan and routes to the next node.

    One LLM call decides whether the goal is met (-> RESPOND) or which
    recovery action to take (new search, navigate to a link, or scroll).
    """

    node_type = NodeType.VERIFY

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        context = state.get_context_for_llm()
        # Cap the page preview so the prompt stays within budget.
        page_preview = state.page_content[:4000] if state.page_content else "(No content)"

        # Runtime prompt in Portuguese (agent's user-facing language);
        # do not translate.
        prompt = f"""Você é um agente de navegação web. Analise o conteúdo e decida o próximo passo.

TAREFA: {state.task}
PLANO: {state.plan.get('goal', 'Nenhum')}
CRITÉRIO DE SUCESSO: {state.plan.get('success_criteria', 'Encontrar a informação pedida')}

HISTÓRICO:
{context}

CONTEÚDO DA PÁGINA ATUAL:
{page_preview}

TEMPO RESTANTE: {int(state.get_remaining_time())}s

Decida:
1. Se encontrou a resposta, retorne: {{"status": "complete", "result": "Sua resposta formatada com **negrito** para valores importantes"}}
2. Se precisa buscar mais, retorne: {{"action": "search", "query": "nova busca"}}
3. Se precisa navegar para um link, retorne: {{"action": "navigate", "url": "https://..."}}
4. Se precisa rolar a página, retorne: {{"action": "scroll"}}

REGRAS:
- Use **negrito** para preços e valores importantes
- Cite as fontes
- Se página pede login, tente outra fonte
- Seja eficiente

Responda APENAS com JSON válido."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=800
            )

            # Parse response - strip a possible ```/```json fence first.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
            if response.startswith("json"):
                response = response[4:]

            decision = json.loads(response)
            state.add_action({"type": "verify", "decision": decision})

            # Route based on decision.
            if decision.get("status") == "complete":
                state.final_result = decision.get("result", "")
                state.success = True
                logger.info("Goal achieved!")
                return state, NodeType.RESPOND

            action = decision.get("action", "")
            if action == "search":
                # Replace the plan's steps with the new search so SearchNode
                # picks up the refined query.
                state.plan["steps"] = [{"action": "search", "query": decision.get("query", state.task)}]
                return state, NodeType.SEARCH
            elif action == "navigate":
                state.url = decision.get("url", "")
                return state, NodeType.NAVIGATE
            elif action == "scroll":
                # Scroll down and re-extract the (now longer) page.
                state.desktop.scroll(-3)
                state.desktop.wait(1000)
                return state, NodeType.EXTRACT

            # Default: try another search.
            return state, NodeType.SEARCH

        except Exception as e:
            logger.error(f"Verify failed: {e}")
            state.add_error(f"Verify failed: {e}")

            # If we're nearly out of time, go produce an answer from
            # whatever was gathered; otherwise keep searching.
            if state.get_remaining_time() < 30:
                return state, NodeType.RESPOND
            return state, NodeType.SEARCH
283
+
284
+
285
class RespondNode(BaseNode):
    """Generates the final answer from everything gathered so far.

    Terminal node: always returns NodeType.RESPOND, which the runner
    treats as the stop condition. Guarantees state.final_result is set,
    falling back to raw page content on LLM failure.
    """

    node_type = NodeType.RESPOND

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        # If we already have a result (e.g. VerifyNode marked completion),
        # we're done - just confirm success.
        if state.final_result:
            state.success = True
            return state, NodeType.RESPOND

        # Generate response from collected data.
        context = state.get_context_for_llm()
        page_content = state.page_content[:3000] if state.page_content else "(Nenhum conteúdo extraído)"

        # Runtime prompt in Portuguese; do not translate.
        prompt = f"""Você realizou uma tarefa de navegação web. Sintetize os resultados.

TAREFA: {state.task}

DADOS COLETADOS:
{context}

ÚLTIMO CONTEÚDO DA PÁGINA:
{page_content}

URLs VISITADAS:
{chr(10).join(state.visited_urls[:5]) if state.visited_urls else '(Nenhuma)'}

INSTRUÇÕES:
- Gere uma resposta útil baseada no que foi encontrado
- Use **negrito** para valores importantes (preços, números, nomes)
- Cite as fontes quando possível
- Se não encontrou o que foi pedido, explique o que encontrou ou diga honestamente que não encontrou

Responda em português de forma clara e organizada."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000
            )
            state.final_result = response.strip()
            # Empty completion counts as failure.
            state.success = bool(state.final_result)
            logger.info(f"Generated response: {len(state.final_result)} chars")

        except Exception as e:
            logger.error(f"Response generation failed: {e}")
            # Fallback: create response from available data so the caller
            # always receives something useful.
            if state.page_content:
                state.final_result = f"**Informação encontrada:**\n\n{state.page_content[:500]}...\n\n*Fonte: {state.visited_urls[-1] if state.visited_urls else 'desconhecida'}*"
            else:
                state.final_result = f"Não foi possível completar a tarefa. Erro: {e}"

        return state, NodeType.RESPOND
app/agents/graph/runner.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Graph runner - executes the agent graph.
2
+
3
+ The runner orchestrates node execution, manages state transitions,
4
+ and yields status updates for streaming.
5
+
6
+ Uses timeout-based execution instead of fixed iteration count.
7
+ """
8
+
9
+ import logging
10
+ import time
11
+ from typing import AsyncGenerator, Dict, Type
12
+
13
+ from app.agents.graph.state import AgentState, NodeType
14
+ from app.agents.graph.nodes import (
15
+ BaseNode,
16
+ PlanNode,
17
+ SearchNode,
18
+ NavigateNode,
19
+ ExtractNode,
20
+ VerifyNode,
21
+ RespondNode,
22
+ )
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
# Node registry
# Maps each NodeType to the class implementing it; run_graph instantiates
# a fresh node object per step from this table.
NODE_REGISTRY: Dict[NodeType, Type[BaseNode]] = {
    NodeType.PLAN: PlanNode,
    NodeType.SEARCH: SearchNode,
    NodeType.NAVIGATE: NavigateNode,
    NodeType.EXTRACT: ExtractNode,
    NodeType.VERIFY: VerifyNode,
    NodeType.RESPOND: RespondNode,
}

# Status messages with emojis
# Default human-readable status line per node; run_graph specializes the
# SEARCH and NAVIGATE messages with the live query/URL before yielding.
STATUS_MESSAGES = {
    NodeType.PLAN: "🎯 Planning task...",
    NodeType.SEARCH: "🔍 Searching...",
    NodeType.NAVIGATE: "🌐 Navigating...",
    NodeType.EXTRACT: "📊 Extracting content...",
    NodeType.VERIFY: "🤔 Analyzing...",
    NodeType.RESPOND: "✅ Generating response...",
}
45
+
46
+
47
async def run_graph(state: AgentState) -> AsyncGenerator[dict, None]:
    """Run the agent graph and yield status updates.

    Args:
        state: Initial agent state with task, url, and desktop

    Yields:
        Status dicts ({"type": "status", ...}) before each step, then a
        single {"type": "result", ...} with the final answer and visited
        links, then {"type": "complete", ...}.
    """
    # Initialize timing; should_continue() measures the budget from here.
    state.start_time = time.time()
    current_node_type = NodeType.PLAN
    state.current_node = current_node_type

    logger.info(f"Starting graph execution for task: {state.task[:50]}, timeout: {state.timeout_seconds}s")

    while state.should_continue():
        state.step_count += 1
        state.current_node = current_node_type

        # Get node instance for the current step.
        node_class = NODE_REGISTRY.get(current_node_type)
        if not node_class:
            logger.error(f"Unknown node type: {current_node_type}")
            break

        node = node_class()

        # Calculate remaining time (snapshot, for the status line/log).
        remaining = int(state.get_remaining_time())
        elapsed = int(state.get_elapsed_time())

        # Yield status update; SEARCH/NAVIGATE get specialized messages
        # with the live query/URL instead of the generic one.
        status_msg = STATUS_MESSAGES.get(current_node_type, "Processing...")
        if current_node_type == NodeType.SEARCH and state.plan.get("steps"):
            for step in state.plan["steps"]:
                if step.get("action") == "search":
                    status_msg = f"🔍 Searching: {step.get('query', state.task)[:40]}..."
                    break
        elif current_node_type == NodeType.NAVIGATE and state.url:
            status_msg = f"🌐 Navigating to {state.url[:40]}..."

        yield {
            "type": "status",
            "message": f"{status_msg} (step {state.step_count}, {remaining}s remaining)"
        }

        # Execute node.
        try:
            state, next_node_type = await node.execute(state)
            logger.info(f"Step {state.step_count}: {current_node_type.value} -> {next_node_type.value} ({elapsed}s elapsed)")

            # RESPOND is terminal: once it has executed, stop the loop.
            if current_node_type == NodeType.RESPOND:
                break

            # Transition to next node.
            current_node_type = next_node_type

        except Exception as e:
            logger.exception(f"Node execution failed: {e}")
            state.add_error(str(e))

            # If running low on time, try to respond; otherwise retry
            # from a fresh search.
            if state.get_remaining_time() < 30:
                current_node_type = NodeType.RESPOND
            else:
                current_node_type = NodeType.SEARCH

    # If we timed out without a result, generate one from what we have
    # so the caller always receives an answer.
    if not state.final_result and not state.success:
        logger.warning("Timeout reached, forcing response generation")
        respond_node = RespondNode()
        state, _ = await respond_node.execute(state)

    # Yield final result.
    yield {
        "type": "result",
        "content": state.final_result,
        "links": state.visited_urls[:10],
        "success": state.success
    }

    yield {"type": "complete", "message": f"Task completed in {int(state.get_elapsed_time())}s"}

    logger.info(f"Graph execution complete. Success: {state.success}, Steps: {state.step_count}, Time: {state.get_elapsed_time():.1f}s")
133
+
app/agents/graph/simple_agent.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simplified agent nodes - ONE LLM call per cycle.
2
+
3
+ DAG:
4
+ START → THINK_ACT ←→ EXECUTE → RESPOND
5
+ ↑______________|
6
+
7
+ ThinkAndAct: Analyzes content + decides action in ONE call
8
+ Execute: Runs the action (search, navigate, scroll) - NO LLM
9
+ Respond: Final synthesis
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import shlex
15
+ import time
16
+ from abc import ABC, abstractmethod
17
+ from typing import Tuple, Optional, List
18
+
19
+ from app.agents.llm_client import generate_completion
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class SimpleState:
    """Minimal mutable state shared across the simplified agent loop.

    Tracks the task, a per-URL content cache (so pages are fetched once),
    visit/action history, accumulated findings, and the final result.
    """

    def __init__(self, task: str, url: Optional[str], desktop, timeout: float = 300):
        self.task = task
        self.url = url
        self.desktop = desktop
        self.timeout = timeout
        self.start_time = time.time()

        # Memory - page text keyed by URL; guarantees no duplicate fetching.
        self.content_cache: dict = {}
        self.visited_urls: List[str] = []
        self.action_history: List[str] = []

        # Accumulated knowledge - deduplicated key findings.
        self.findings: List[str] = []

        # Result
        self.final_result = ""
        self.done = False

    def elapsed(self) -> float:
        """Seconds since the agent started."""
        return time.time() - self.start_time

    def remaining(self) -> float:
        """Seconds left in the time budget (never negative)."""
        return max(0, self.timeout - self.elapsed())

    def should_continue(self) -> bool:
        """True while not done and more than 20s of budget remains."""
        return not self.done and self.remaining() > 20

    def add_page(self, url: str, content: str):
        """Cache a page (first 4000 chars) and record the visit once."""
        self.content_cache.setdefault(url, content[:4000])
        if url not in self.visited_urls:
            self.visited_urls.append(url)

    def get_cached_content(self, url: str) -> Optional[str]:
        """Previously cached page text, or None when never fetched."""
        return self.content_cache.get(url)

    def add_finding(self, finding: str):
        """Remember a key finding, skipping blanks and duplicates."""
        if finding and finding not in self.findings:
            self.findings.append(finding)

    def get_all_content(self) -> str:
        """Join cached text of the last five visited pages for synthesis."""
        sections = [
            f"[{url[:60]}]\n{self.content_cache[url][:1500]}"
            for url in self.visited_urls[-5:]
            if self.content_cache.get(url)
        ]
        return "\n\n---\n\n".join(sections)

    def get_recent_content(self) -> str:
        """Join cached text of the last two visited pages for context."""
        sections = [
            f"[{url[:60]}]\n{self.content_cache[url][:2000]}"
            for url in self.visited_urls[-2:]
            if self.content_cache.get(url)
        ]
        return "\n\n---\n\n".join(sections)
89
+
90
+
91
async def think_and_act(state: SimpleState) -> Tuple[str, dict]:
    """
    ONE LLM call that analyzes current state and decides next action.
    Returns: (action_type, action_params)

    Actions:
        - search: {"query": "..."}
        - navigate: {"url": "..."}
        - scroll: {}
        - complete: {"result": "..."}

    Includes a revisit guard: if the LLM asks to navigate to an already
    visited URL, the call is overridden with either a completion (when
    usable content exists) or a rephrased search.
    """

    content = state.get_recent_content() or "(No content yet)"
    history = ", ".join(state.action_history[-5:]) if state.action_history else "(starting)"

    # Memory: show visited URLs so LLM doesn't repeat itself.
    visited = "\n".join([f" - {u[:70]}" for u in state.visited_urls[-10:]]) if state.visited_urls else "(none)"

    prompt = f"""You are a web research agent. Analyze the current state and decide your next action.

TASK: {state.task}

ALREADY VISITED (DO NOT visit again):
{visited}

CURRENT PAGE CONTENT:
{content}

HISTORY: {history}
TIME REMAINING: {int(state.remaining())}s

Decide ONE action. Return JSON:

If you need to search: {{"action": "search", "query": "search terms"}}
If you found a NEW relevant link to visit: {{"action": "navigate", "url": "https://..."}}
If you need to scroll for more content: {{"action": "scroll"}}
If you have enough info to answer: {{"action": "complete", "result": "Your answer with **bold** for important values. Cite sources."}}

RULES:
- DO NOT navigate to URLs already in "ALREADY VISITED" list
- Only use URLs you see in the content above
- If you see the answer, return complete immediately
- Use **bold** for prices, numbers, names
- Be efficient - don't repeat searches

Return ONLY valid JSON:"""

    try:
        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=800
        )

        # Parse JSON - strip a possible ```/```json fence first.
        response = response.strip()
        if response.startswith("```"):
            response = response.split("```")[1]
        if response.startswith("json"):
            response = response[4:]

        decision = json.loads(response)
        action = decision.get("action", "search")

        # Safety check: prevent navigating to an already visited URL even
        # though the prompt forbids it (the LLM may still try).
        if action == "navigate":
            url = decision.get("url", "").rstrip("/")

            # Check if URL already visited (normalize trailing slash).
            visited_normalized = [u.rstrip("/") for u in state.visited_urls]
            if url in visited_normalized or url in state.visited_urls:
                logger.warning(f"LLM tried to revisit {url}, trying different approach")

                # If we have good (non-blocked, non-login) content, finish.
                good_content = [c for c in state.content_cache.values()
                                if c and c not in ["[BLOCKED]", "[LOGIN_REQUIRED]"]]
                if good_content:
                    return "complete", {"result": f"Informação coletada: {state.get_recent_content()[:800]}"}

                # Otherwise, search with different terms.
                return "search", {"query": f"{state.task} site:wikipedia.org OR site:gov.br"}

        logger.info(f"ThinkAndAct decision: {action}")
        return action, decision

    except Exception as e:
        logger.error(f"ThinkAndAct failed: {e}")
        # Fallback: if we have any content, complete with it; else re-search.
        if state.content_cache:
            return "complete", {"result": f"Based on collected data: {state.get_recent_content()[:500]}"}
        return "search", {"query": state.task}
181
+
182
+
183
+ async def execute_action(state: SimpleState, action: str, params: dict) -> bool:
184
+ """
185
+ Execute action WITHOUT LLM call.
186
+ Uses cache to avoid repeated requests.
187
+ Returns True if should continue, False if done.
188
+ """
189
+ desktop = state.desktop
190
+
191
+ if action == "complete":
192
+ state.final_result = params.get("result", "")
193
+ state.done = True
194
+ return False
195
+
196
+ elif action == "search":
197
+ query = params.get("query", state.task)
198
+ search_url = f"https://html.duckduckgo.com/html/?q={query.replace(' ', '+')}"
199
+
200
+ # Check cache first
201
+ cached = state.get_cached_content(search_url)
202
+ if cached:
203
+ logger.info(f"Using cached content for search: {query[:30]}")
204
+ state.action_history.append(f"search(cached):{query[:30]}")
205
+ return True
206
+
207
+ desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
208
+ desktop.wait(3000)
209
+
210
+ content = await _extract_content(desktop, search_url)
211
+ state.add_page(search_url, content)
212
+ state.action_history.append(f"search:{query[:30]}")
213
+
214
+ return True
215
+
216
+ elif action == "navigate":
217
+ url = params.get("url", "")
218
+ if not url.startswith("http"):
219
+ return True # Invalid URL, continue
220
+
221
+ # Check cache first - don't re-fetch
222
+ cached = state.get_cached_content(url)
223
+ if cached:
224
+ logger.info(f"Using cached content for: {url[:50]}")
225
+ state.action_history.append(f"nav(cached):{url[:30]}")
226
+ return True
227
+
228
+ desktop.commands.run(f"google-chrome {shlex.quote(url)} &", background=True)
229
+ desktop.wait(3000)
230
+
231
+ content = await _extract_content(desktop, url)
232
+
233
+ # Check for Cloudflare/bot detection - just skip if blocked
234
+ from app.agents.flaresolverr import is_cloudflare_blocked, is_login_wall
235
+
236
+ if is_cloudflare_blocked(content):
237
+ logger.warning(f"Cloudflare block detected at {url[:50]}, skipping...")
238
+ # Mark as visited so LLM doesn't try again
239
+ if url not in state.visited_urls:
240
+ state.visited_urls.append(url)
241
+ state.content_cache[url] = "[BLOCKED]" # Mark as blocked in cache
242
+ state.action_history.append(f"nav(blocked):{url[:30]}")
243
+ return True
244
+
245
+ if is_login_wall(content):
246
+ logger.warning(f"Login wall detected at {url[:50]}, skipping...")
247
+ # Mark as visited so LLM doesn't try again
248
+ if url not in state.visited_urls:
249
+ state.visited_urls.append(url)
250
+ state.content_cache[url] = "[LOGIN_REQUIRED]" # Mark in cache
251
+ state.action_history.append(f"nav(login_wall):{url[:30]}")
252
+ return True
253
+
254
+ state.add_page(url, content)
255
+ state.action_history.append(f"nav:{url[:30]}")
256
+
257
+ return True
258
+
259
+ elif action == "scroll":
260
+ desktop.scroll(-3)
261
+ desktop.wait(1500)
262
+
263
+ # Update cache for current page with new content
264
+ if state.visited_urls:
265
+ current_url = state.visited_urls[-1]
266
+ content = await _extract_content(desktop, current_url)
267
+ state.content_cache[current_url] = content[:4000] # Update cache
268
+
269
+ state.action_history.append("scroll")
270
+ return True
271
+
272
+ return True
273
+
274
+
275
async def _extract_content(desktop, url: str) -> str:
    """Fetch *url* with curl inside the sandbox and return plain text.

    Strips <script>/<style> blocks and all remaining tags, collapses
    whitespace, and truncates to ~6000 characters. Returns "" on any
    failure.
    """
    try:
        # The URL comes from search results / the LLM and is untrusted:
        # shlex.quote prevents shell injection (the previous bare '{url}'
        # interpolation broke on quotes and allowed command injection).
        result = desktop.commands.run(
            f"curl -sL --max-time 8 --connect-timeout 5 "
            f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
            f"{shlex.quote(url)} 2>/dev/null | "
            "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
            "sed 's/<[^>]*>//g' | "
            "tr -s ' \\n' ' ' | "
            "head -c 6000",
            timeout=12
        )
        return result.stdout.strip() if hasattr(result, 'stdout') else ""
    except Exception as e:
        logger.warning(f"Extract failed: {e}")
        return ""
292
+
293
+
294
async def generate_final_response(state: SimpleState) -> str:
    """Generate a response if the agent timed out without completing.

    Returns state.final_result unchanged when the agent already finished;
    otherwise asks the LLM to synthesize an answer from the cached pages.
    On LLM failure returns a Portuguese error message instead of raising.
    """
    # Agent finished normally - nothing to synthesize.
    if state.final_result:
        return state.final_result

    content = state.get_recent_content()

    prompt = f"""Based on the research done, answer the question.

TASK: {state.task}

COLLECTED DATA:
{content if content else "(No data collected)"}

SOURCES VISITED: {', '.join(state.visited_urls[:5]) if state.visited_urls else 'None'}

Provide a helpful answer based on what was found. Use **bold** for important values. If you couldn't find the answer, say so honestly.

Answer in Portuguese:"""

    try:
        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000
        )
        return response.strip()
    except Exception as e:
        # Best-effort: surface the failure to the user rather than raising.
        return f"Não foi possível completar a pesquisa. Erro: {e}"
app/agents/graph/state.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agent state management for graph-based execution.
2
+
3
+ The state is passed between nodes and accumulates information
4
+ throughout the agent's execution.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from typing import Optional, Any
9
+ from enum import Enum
10
+
11
+
12
class NodeType(Enum):
    """Types of nodes in the agent graph.

    The string values are serialized into action-history entries
    (see AgentState.add_action), so they must remain stable.
    """
    START = "start"
    PLAN = "plan"
    SEARCH = "search"
    NAVIGATE = "navigate"
    EXTRACT = "extract"
    VERIFY = "verify"
    RESPOND = "respond"
    ERROR = "error"
22
+
23
+
24
@dataclass
class AgentState:
    """Shared state passed between graph nodes.

    Accumulates planning info, visited pages, extracted data and
    action/error history over one agent run. The timeout clock is
    started lazily by the first should_continue() call.
    """

    # Task info
    task: str = ""
    url: Optional[str] = None

    # Planning
    plan: dict = field(default_factory=dict)
    current_subtask: int = 0

    # Execution
    current_node: NodeType = NodeType.START
    step_count: int = 0
    start_time: float = field(default_factory=lambda: 0.0)  # 0.0 means "clock not started"
    timeout_seconds: float = 300.0  # 5 minutes default

    # Memory
    visited_urls: list = field(default_factory=list)  # visit order; [-1] treated as current page
    extracted_data: list = field(default_factory=list)  # dicts appended by add_extracted_data
    page_content: str = ""
    window_title: str = ""

    # History
    action_history: list = field(default_factory=list)  # dicts appended by add_action
    error_history: list = field(default_factory=list)  # dicts appended by add_error

    # Results
    final_result: str = ""
    success: bool = False

    # Desktop reference (set at runtime)
    desktop: Any = None

    def add_action(self, action: dict):
        """Record *action* together with the current step count and node."""
        self.action_history.append({
            "step": self.step_count,
            "node": self.current_node.value,
            "action": action
        })

    def add_error(self, error: str):
        """Record an error message with the step it occurred on."""
        self.error_history.append({
            "step": self.step_count,
            "error": error
        })

    def add_extracted_data(self, source: str, data: dict):
        """Store *data* extracted from *source*, tagged with the current URL.

        The URL is the most recently visited one ("" if none yet).
        """
        self.extracted_data.append({
            "source": source,
            "url": self.visited_urls[-1] if self.visited_urls else "",
            "data": data
        })

    def get_context_for_llm(self) -> str:
        """Get formatted context for LLM prompts.

        Includes the last 5 actions, all extracted data and the last 3
        errors; returns an empty string when nothing has happened yet.
        """
        context_parts = []

        if self.action_history:
            recent = self.action_history[-5:]
            context_parts.append("Recent actions:")
            for h in recent:
                context_parts.append(f" - {h['node']}: {h['action']}")

        if self.extracted_data:
            context_parts.append("\nExtracted data:")
            for d in self.extracted_data:
                context_parts.append(f" - {d['source']}: {d['data']}")

        if self.error_history:
            context_parts.append("\nErrors encountered:")
            for e in self.error_history[-3:]:
                context_parts.append(f" - {e['error']}")

        return "\n".join(context_parts)

    def should_continue(self) -> bool:
        """Check if agent should continue execution based on timeout.

        Side effect: the first call starts the timeout clock by setting
        start_time to the current wall-clock time.
        """
        import time
        if self.start_time == 0:
            self.start_time = time.time()

        elapsed = time.time() - self.start_time
        time_ok = elapsed < self.timeout_seconds

        # Continue only while not finished, not timed out, and not in
        # the terminal ERROR node.
        return (
            not self.success and
            time_ok and
            self.current_node != NodeType.ERROR
        )

    def get_elapsed_time(self) -> float:
        """Get elapsed time in seconds (0.0 before the clock starts)."""
        import time
        if self.start_time == 0:
            return 0.0
        return time.time() - self.start_time

    def get_remaining_time(self) -> float:
        """Get remaining time in seconds, clamped at 0."""
        return max(0, self.timeout_seconds - self.get_elapsed_time())
app/agents/heavy_search.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Heavy Search Agent.
2
+
3
+ Middle-ground between Quick Search and Deep Research.
4
+ Scrapes full content from top results for richer answers.
5
+ """
6
+
7
+ import json
8
+ import time
9
+ from typing import AsyncIterator
10
+
11
+ from app.agents.llm_client import generate_completion_stream
12
+ from app.sources.aggregator import aggregate_search
13
+ from app.sources.scraper import scrape_multiple_urls
14
+ from app.reranking.pipeline import rerank_results
15
+ from app.temporal.intent_detector import detect_temporal_intent
16
+
17
+
18
async def run_heavy_search(
    query: str,
    max_results: int = 15,
    max_scrape: int = 8,
    freshness: str = "any",
) -> AsyncIterator[str]:
    """
    Run heavy search with content scraping.

    Steps:
    1. Aggregate search from multiple sources
    2. Rerank results
    3. Scrape full content from top N results
    4. Stream synthesized answer

    Args:
        query: User search query.
        max_results: Maximum ranked results kept after reranking.
        max_scrape: How many top results get their pages fetched in full.
        freshness: Freshness filter forwarded to the aggregator.

    Yields:
        SSE event strings (wire format produced by _sse_event)
    """
    start_time = time.perf_counter()

    try:
        # Step 1: Status
        yield _sse_event("status", {"phase": "searching", "message": "Searching multiple sources..."})

        # Step 2: Aggregate search
        temporal_intent, temporal_urgency = detect_temporal_intent(query)

        # Over-fetch a little so reranking has candidates to discard.
        raw_results = await aggregate_search(
            query=query,
            max_results=max_results + 5,
            freshness=freshness,
            include_wikipedia=True,
        )

        if not raw_results:
            yield _sse_event("error", {"message": "No results found"})
            return

        yield _sse_event("search_complete", {
            "results_count": len(raw_results),
            "sources": list(set(r.get("source", "unknown") for r in raw_results)),
        })

        # Step 3: Rerank (use embeddings when we have many results from SearXNG)
        yield _sse_event("status", {"phase": "ranking", "message": "Ranking results..."})

        # Enable embeddings when we have many results (SearXNG provides volume)
        use_embeddings = len(raw_results) > 20

        ranked_results = await rerank_results(
            query=query,
            results=raw_results,
            temporal_urgency=temporal_urgency,
            max_results=max_results,
            use_embeddings=use_embeddings,
        )

        # Step 4: Scrape top results
        yield _sse_event("status", {"phase": "scraping", "message": f"Reading top {max_scrape} sources..."})

        urls_to_scrape = [r.get("url") for r in ranked_results[:max_scrape] if r.get("url")]
        scraped_content = await scrape_multiple_urls(
            urls=urls_to_scrape,
            max_chars_per_url=4000,
            max_concurrent=3,
        )

        # Merge scraped content into results; fall back to the search
        # snippet when scraping produced nothing for a URL.
        for result in ranked_results:
            url = result.get("url", "")
            if url in scraped_content and scraped_content[url]:
                result["full_content"] = scraped_content[url]
                result["scraped"] = True
            else:
                result["full_content"] = result.get("content", "")
                result["scraped"] = False

        scraped_count = sum(1 for r in ranked_results if r.get("scraped"))
        yield _sse_event("scrape_complete", {
            "scraped_count": scraped_count,
            "total": len(urls_to_scrape),
        })

        # Step 5: Send results (metadata only; full content stays server-side)
        yield _sse_event("results", {
            "results": [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "score": r.get("score", 0),
                    "source": r.get("source", ""),
                    "scraped": r.get("scraped", False),
                }
                for r in ranked_results
            ],
            "temporal_intent": temporal_intent,
            "temporal_urgency": temporal_urgency,
        })

        # Step 6: Synthesize answer
        yield _sse_event("status", {"phase": "synthesizing", "message": "Generating answer..."})
        yield _sse_event("answer_start", {})

        async for chunk in _synthesize_heavy_answer(query, ranked_results, temporal_intent):
            yield _sse_event("answer_chunk", {"content": chunk})

        # Done
        total_time = time.perf_counter() - start_time
        yield _sse_event("done", {
            "total_sources": len(ranked_results),
            "scraped_sources": scraped_count,
            "total_time_seconds": round(total_time, 2),
        })

    except Exception as e:
        # Report failures as an SSE error event instead of raising, so
        # the client stream always terminates cleanly.
        yield _sse_event("error", {"message": str(e)})
134
+
135
+
136
async def _synthesize_heavy_answer(
    query: str,
    results: list[dict],
    temporal_intent: str,
) -> AsyncIterator[str]:
    """Synthesize answer from scraped content.

    Streams LLM output built from the top 8 results, then appends a
    fixed markdown Sources footer (📄 = fully scraped page, 📋 = search
    snippet only).
    """

    # Build context with full content (each source capped at 3000 chars).
    context_parts = []
    for i, r in enumerate(results[:8], 1):
        content = r.get("full_content", r.get("content", ""))[:3000]
        scraped_tag = "[FULL]" if r.get("scraped") else "[SNIPPET]"

        context_parts.append(
            f"[{i}] {r.get('title', 'Untitled')} {scraped_tag}\n"
            f"URL: {r.get('url', '')}\n"
            f"Content:\n{content}\n"
        )

    context = "\n---\n".join(context_parts)

    prompt = f"""You are a research assistant providing comprehensive answers.

QUERY: {query}
TEMPORAL INTENT: {temporal_intent}

SOURCES (some with full content [FULL], some with snippets [SNIPPET]):
{context}

INSTRUCTIONS:
1. Provide a comprehensive, well-structured answer
2. Use information from [FULL] sources more extensively
3. Cite sources using [1], [2], etc.
4. Write in the same language as the query
5. Be thorough but clear

Answer:"""

    messages = [
        {"role": "system", "content": "You are a helpful research assistant."},
        {"role": "user", "content": prompt},
    ]

    async for chunk in generate_completion_stream(messages, temperature=0.3):
        yield chunk

    # Add citations
    yield "\n\n---\n**Sources:**\n"
    for i, r in enumerate(results[:8], 1):
        scraped = "📄" if r.get("scraped") else "📋"
        yield f"{scraped} [{i}] [{r.get('title', 'Untitled')}]({r.get('url', '')})\n"
187
+
188
+
189
+ def _sse_event(event_type: str, data: dict) -> str:
190
+ """Format an SSE event."""
191
+ payload = {"type": event_type, **data}
192
+ return f"data: {json.dumps(payload)}\n\n"
app/agents/llm_client.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM client abstraction for multiple providers.
2
+
3
+ Supports Groq and OpenRouter for LLM inference.
4
+ """
5
+
6
+ import httpx
7
+ import json
8
+ from typing import Optional, AsyncIterator
9
+ import asyncio
10
+
11
+ from tenacity import (
12
+ retry,
13
+ stop_after_attempt,
14
+ wait_exponential,
15
+ retry_if_exception_type,
16
+ )
17
+
18
+ from app.config import get_settings
19
+
20
+
21
class RetryableError(Exception):
    """Raised for transient provider failures that should trigger a retry."""
24
+
25
+
26
async def generate_completion(
    messages: list[dict],
    model: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Route a chat-completion request to the configured LLM backend.

    Args:
        messages: Chat messages in OpenAI format.
        model: Optional model override; falls back to settings.llm_model.
        temperature: Sampling temperature.
        max_tokens: Generation cap.

    Returns:
        The assistant message text.

    Raises:
        ValueError: If the configured provider is not recognized.
    """
    settings = get_settings()
    backend = settings.llm_provider
    chosen_model = model or settings.llm_model

    if backend == "groq":
        return await _call_groq(messages, chosen_model, temperature, max_tokens)
    if backend == "openrouter":
        return await _call_openrouter(messages, chosen_model, temperature, max_tokens)
    raise ValueError(f"Unknown LLM provider: {backend}")
43
+
44
+
45
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(RetryableError),
    reraise=True,
)
async def _call_groq(
    messages: list[dict],
    model: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Call Groq API with retry logic.

    Retries (via tenacity) on 429/502/503/504 and on timeouts; other
    HTTP error statuses raise immediately through raise_for_status().

    Returns:
        The assistant message content.

    Raises:
        ValueError: If GROQ_API_KEY is not configured.
        RetryableError: After retries are exhausted on transient failures.
    """
    settings = get_settings()

    if not settings.groq_api_key:
        raise ValueError("GROQ_API_KEY not configured")

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.groq_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": messages,
                    "temperature": temperature,
                    "max_tokens": max_tokens,
                },
            )

            # Retry on rate limit or server errors
            if response.status_code in (429, 502, 503, 504):
                raise RetryableError(f"Groq error {response.status_code}")

            response.raise_for_status()
            data = response.json()

            return data["choices"][0]["message"]["content"]
    except httpx.TimeoutException as e:
        # Timeouts are transient: convert so the decorator retries them.
        raise RetryableError(f"Groq timeout: {e}")
89
+
90
+
91
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(RetryableError),
    reraise=True,
)
async def _call_openrouter(
    messages: list[dict],
    model: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Call OpenRouter API with retry logic.

    Args:
        messages: Chat messages in OpenAI format.
        model: Model identifier.
        temperature: Sampling temperature.
        max_tokens: Generation cap.

    Returns:
        The assistant message content.

    Raises:
        ValueError: If the API key is missing or a non-retryable HTTP
            error status is returned.
        RetryableError: After retries are exhausted on 429/5xx/timeouts.
    """
    settings = get_settings()

    if not settings.openrouter_api_key:
        raise ValueError("OPENROUTER_API_KEY not configured")

    headers = {
        "Authorization": f"Bearer {settings.openrouter_api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://madras1-lancer.hf.space",
        "X-Title": "Lancer Search API",
    }

    # BUG FIX: temperature and max_tokens were accepted by this function
    # but never sent, so OpenRouter silently used its own defaults
    # (inconsistent with _call_groq). Include them in the payload.
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=headers,
                content=json.dumps(payload),
            )

            # Retry on rate limit or server errors
            if response.status_code in (429, 502, 503, 504):
                raise RetryableError(f"OpenRouter error {response.status_code}")

            if response.status_code != 200:
                error_text = response.text
                raise ValueError(f"OpenRouter error {response.status_code}: {error_text}")

            data = response.json()
            return data["choices"][0]["message"]["content"]
    except httpx.TimeoutException as e:
        # Chain the cause so the original timeout is visible in tracebacks.
        raise RetryableError(f"OpenRouter timeout: {e}") from e
141
+
142
+
143
async def generate_completion_stream(
    messages: list[dict],
    model: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> AsyncIterator[str]:
    """Generate a streaming completion using OpenRouter.

    Args:
        messages: Chat messages in OpenAI format.
        model: Optional model override; falls back to settings.llm_model.
        temperature: Sampling temperature.
        max_tokens: Generation cap.

    Yields:
        Content deltas as they arrive over the SSE stream.

    Raises:
        ValueError: If the API key is missing or the API responds with a
            non-200 status.
    """
    settings = get_settings()
    model = model or settings.llm_model

    if not settings.openrouter_api_key:
        raise ValueError("OPENROUTER_API_KEY not configured")

    headers = {
        "Authorization": f"Bearer {settings.openrouter_api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://madras1-lancer.hf.space",
        "X-Title": "Lancer Search API",
    }

    # BUG FIX: temperature and max_tokens were accepted but never sent,
    # so the provider defaults were silently used. Include them, matching
    # the non-streaming call paths.
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": True,
    }

    async with httpx.AsyncClient(timeout=120.0) as client:
        async with client.stream(
            "POST",
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            content=json.dumps(payload),
        ) as response:
            if response.status_code != 200:
                error_text = await response.aread()
                raise ValueError(f"OpenRouter streaming error {response.status_code}: {error_text}")

            # Parse the SSE stream: each "data: ..." line carries a JSON
            # chunk until the "[DONE]" sentinel.
            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data_str = line[6:]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                        delta = data.get("choices", [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except json.JSONDecodeError:
                        # Ignore keep-alive/partial lines.
                        continue
app/agents/planner.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Research Planner Agent.
2
+
3
+ Decomposes complex queries into multiple research dimensions.
4
+ """
5
+
6
+ import json
7
+ from typing import Optional
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from app.agents.llm_client import generate_completion
12
+ from app.config import get_settings
13
+
14
+
15
class ResearchDimension(BaseModel):
    """A single dimension/aspect to research.

    Field descriptions also serve as schema documentation for the LLM
    output contract in PLANNER_PROMPT.
    """

    name: str = Field(..., description="Short name for this dimension")
    description: str = Field(..., description="What this dimension covers")
    search_query: str = Field(..., description="Optimized search query for this dimension")
    # Lower number = higher priority; bounded to 1..3 by validation.
    priority: int = Field(default=1, ge=1, le=3, description="1=high, 2=medium, 3=low")
22
+
23
+
24
class ResearchPlan(BaseModel):
    """Complete research plan with all dimensions.

    Produced by create_research_plan; dimensions arrive sorted by
    ascending priority value (highest priority first).
    """

    original_query: str
    refined_query: str = Field(..., description="Clarified version of the query")
    dimensions: list[ResearchDimension]
    # Rough budget hint for downstream search (5 sources per dimension).
    estimated_sources: int = Field(default=20)
31
+
32
+
33
# Prompt template for the planning LLM. Double braces {{ }} escape the
# literal JSON braces for str.format; only {query} is substituted.
PLANNER_PROMPT = """You are a research planning assistant. Your job is to decompose a complex query into multiple research dimensions.

USER QUERY: {query}

INSTRUCTIONS:
1. Analyze the query and identify 2-6 key dimensions/aspects that need to be researched
2. Each dimension should be distinct and cover a different angle
3. Create an optimized search query for each dimension
4. Assign priority (1=high, 2=medium, 3=low) based on relevance to the main query
5. Respond ONLY with valid JSON, no other text

OUTPUT FORMAT:
{{
  "refined_query": "A clearer version of the user's query",
  "dimensions": [
    {{
      "name": "Short name",
      "description": "What this covers",
      "search_query": "Optimized search query",
      "priority": 1
    }}
  ]
}}

Generate the research plan:"""
58
+
59
+
60
async def create_research_plan(
    query: str,
    max_dimensions: int = 6,
) -> ResearchPlan:
    """
    Create a research plan by decomposing a query into dimensions.

    Args:
        query: The user's research query
        max_dimensions: Maximum number of dimensions to generate

    Returns:
        ResearchPlan with dimensions to investigate, sorted by priority.
        Falls back to a generic two-dimension plan when the LLM output
        cannot be parsed or validated.
    """
    messages = [
        {"role": "system", "content": "You are a research planning assistant. Always respond with valid JSON only."},
        {"role": "user", "content": PLANNER_PROMPT.format(query=query)},
    ]

    try:
        response = await generate_completion(messages, temperature=0.3)

        # Parse JSON response; trim any extra text around the outermost
        # braces before decoding.
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        if json_start >= 0 and json_end > json_start:
            response = response[json_start:json_end]

        data = json.loads(response)

        # Build dimensions, capping at max_dimensions.
        dimensions = []
        for dim_data in data.get("dimensions", [])[:max_dimensions]:
            dimensions.append(ResearchDimension(
                name=dim_data.get("name", "Unknown"),
                description=dim_data.get("description", ""),
                search_query=dim_data.get("search_query", query),
                priority=dim_data.get("priority", 2),
            ))

        # Sort by priority (1 = highest first)
        dimensions.sort(key=lambda d: d.priority)

        return ResearchPlan(
            original_query=query,
            refined_query=data.get("refined_query", query),
            dimensions=dimensions,
            estimated_sources=len(dimensions) * 5,
        )

    # BUG FIX: also catch ValueError — pydantic's ValidationError is a
    # ValueError subclass, so malformed-but-decodable LLM output (e.g.
    # priority out of range) now hits the fallback instead of crashing.
    # Also removed an unused get_settings() call and unused `as e`.
    except (json.JSONDecodeError, KeyError, ValueError):
        # Fallback: create a simple 2-dimension plan
        return ResearchPlan(
            original_query=query,
            refined_query=query,
            dimensions=[
                ResearchDimension(
                    name="Main Research",
                    description=f"Primary research on: {query}",
                    search_query=query,
                    priority=1,
                ),
                ResearchDimension(
                    name="Background",
                    description=f"Background and context for: {query}",
                    search_query=f"{query} background overview",
                    priority=2,
                ),
            ],
            estimated_sources=10,
        )
app/agents/synthesizer.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Answer synthesizer agent.
2
+
3
+ Generates a coherent answer from search results with citations.
4
+ """
5
+
6
+ from datetime import datetime
7
+ from typing import Optional, AsyncIterator
8
+
9
+ from app.api.schemas import SearchResult, TemporalContext, Citation
10
+ from app.agents.llm_client import generate_completion, generate_completion_stream
11
+
12
+
13
# Prompt template for answer synthesis. Filled by _build_messages via
# str.format; note {temporal_urgency:.0%} requires a float argument.
SYNTHESIS_PROMPT = """You are a research assistant that synthesizes information from search results.

CURRENT DATE: {current_date}

USER QUERY: {query}

TEMPORAL CONTEXT:
- Query intent: {temporal_intent} (the user {intent_explanation})
- Temporal urgency: {temporal_urgency:.0%} (how important freshness is)

SEARCH RESULTS:
{formatted_results}

INSTRUCTIONS:
1. Synthesize a comprehensive answer based on the search results
2. ALWAYS cite your sources using [1], [2], etc. format
3. If the query requires current information, prioritize the most recent results
4. If there are conflicting dates or versions mentioned, use the most recent accurate information
5. Be concise but thorough
6. If information seems outdated compared to current date ({current_date}), note this
7. Write in the same language as the query

Generate your answer:"""
36
+
37
+
38
async def synthesize_answer(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> tuple[str, list[Citation]]:
    """Produce a cited answer for *query* from ranked search results.

    Args:
        query: Original search query.
        results: Ranked results to synthesize from.
        temporal_context: Optional temporal analysis of the query.

    Returns:
        Tuple of (answer text, citations for the top results). With no
        results, a fixed message and an empty citation list are returned.
    """
    if not results:
        return "No results found to synthesize an answer.", []

    llm_messages = _build_messages(query, results, temporal_context)

    try:
        answer_text = await generate_completion(llm_messages, temperature=0.3)
    except Exception as e:
        # Degrade gracefully: surface the failure in the answer body so
        # callers still receive the search results and citations.
        answer_text = f"Error generating synthesis: {e}. Please review the search results directly."

    return answer_text, _build_citations(results)
69
+
70
+
71
async def synthesize_answer_stream(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> AsyncIterator[str]:
    """Stream a cited answer for *query*, chunk by chunk.

    Args:
        query: Original search query.
        results: Ranked results to synthesize from.
        temporal_context: Optional temporal analysis of the query.

    Yields:
        Pieces of the answer as the LLM produces them. With no results,
        a single fixed message is yielded; on LLM failure a single error
        message chunk is yielded instead of raising.
    """
    if not results:
        yield "No results found to synthesize an answer."
        return

    llm_messages = _build_messages(query, results, temporal_context)

    try:
        async for piece in generate_completion_stream(llm_messages, temperature=0.3):
            yield piece
    except Exception as e:
        yield f"Error generating synthesis: {e}. Please review the search results directly."
100
+
101
+
102
def _build_messages(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> list[dict]:
    """Build the system/user message pair for answer synthesis.

    Only the top 10 results are embedded in the prompt. When no
    temporal context is supplied, neutral intent, 0.5 urgency and
    today's date are used as defaults.
    """
    # Format results for the prompt
    formatted_results = format_results_for_prompt(results[:10])  # Top 10 only

    # Prepare temporal context defaults
    current_date = datetime.now().strftime("%Y-%m-%d")
    temporal_intent = "neutral"
    temporal_urgency = 0.5

    if temporal_context:
        temporal_intent = temporal_context.query_temporal_intent
        temporal_urgency = temporal_context.temporal_urgency
        current_date = temporal_context.current_date

    # Map intent to a human-readable explanation inserted into the prompt;
    # unknown intents fall back to "" via .get below.
    intent_explanations = {
        "current": "is looking for the most recent/current information",
        "historical": "is interested in historical or background information",
        "neutral": "has no specific temporal preference",
    }

    prompt = SYNTHESIS_PROMPT.format(
        current_date=current_date,
        query=query,
        temporal_intent=temporal_intent,
        intent_explanation=intent_explanations.get(temporal_intent, ""),
        temporal_urgency=temporal_urgency,
        formatted_results=formatted_results,
    )

    return [
        {"role": "system", "content": "You are a helpful research assistant."},
        {"role": "user", "content": prompt},
    ]
141
+
142
+
143
def _build_citations(results: list[SearchResult]) -> list[Citation]:
    """Create numbered Citation entries for the top ten results."""
    return [
        Citation(index=position, url=result.url, title=result.title)
        for position, result in enumerate(results[:10], 1)
    ]
155
+
156
+
157
def format_results_for_prompt(results: list[SearchResult]) -> str:
    """Render search results as a numbered, citation-ready context block.

    Each entry shows title (with publication date when known), URL,
    freshness/authority percentages and the first 500 characters of
    content; entries are separated by blank lines.
    """
    entries = []

    for index, item in enumerate(results, 1):
        if item.published_date:
            date_str = f" (Published: {item.published_date.strftime('%Y-%m-%d')})"
        else:
            date_str = ""

        entry = (
            f"[{index}] {item.title}{date_str}\n"
            f" URL: {item.url}\n"
            f" Freshness: {item.freshness_score:.0%} | Authority: {item.authority_score:.0%}\n"
            f" Content: {item.content[:500]}..."
        )
        entries.append(entry)

    return "\n\n".join(entries)
app/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API routes package."""
app/api/routes/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API routes package."""
app/api/routes/search.py ADDED
@@ -0,0 +1,579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search API routes."""
2
+
3
+ import json
4
+ import time
5
+ from datetime import datetime
6
+
7
+ from fastapi import APIRouter, HTTPException, Request
8
+ from fastapi.responses import StreamingResponse
9
+
10
+ from app.api.schemas import (
11
+ SearchRequest,
12
+ SearchResponse,
13
+ SearchResult,
14
+ TemporalContext,
15
+ Citation,
16
+ ErrorResponse,
17
+ DeepResearchRequest,
18
+ BrowseRequest,
19
+ )
20
+ from app.config import get_settings
21
+ from app.temporal.intent_detector import detect_temporal_intent
22
+ from app.temporal.freshness_scorer import calculate_freshness_score
23
+ from app.sources.tavily import search_tavily
24
+ from app.sources.duckduckgo import search_duckduckgo
25
+ from app.reranking.pipeline import rerank_results
26
+ from app.agents.synthesizer import synthesize_answer, synthesize_answer_stream
27
+ from app.middleware.rate_limiter import limiter
28
+
29
+ router = APIRouter()
30
+
31
+
32
@router.post(
    "/search",
    response_model=SearchResponse,
    responses={500: {"model": ErrorResponse}},
    summary="Search with AI synthesis",
    description="Perform a search with temporal intelligence and return an AI-synthesized answer.",
)
@limiter.limit("30/minute")
async def search(request: Request, body: SearchRequest) -> SearchResponse:
    """
    Perform an intelligent search with:
    - Temporal intent detection
    - Multi-source search (Tavily first, DuckDuckGo fallback)
    - Multi-stage reranking
    - AI-powered answer synthesis (skipped when include_answer is off)

    Raises:
        HTTPException: 500 carrying the underlying error message.
    """
    start_time = time.perf_counter()
    settings = get_settings()

    try:
        # Step 1: Analyze temporal intent
        temporal_intent, temporal_urgency = detect_temporal_intent(body.query)

        temporal_context = TemporalContext(
            query_temporal_intent=temporal_intent,
            temporal_urgency=temporal_urgency,
            current_date=datetime.now().strftime("%Y-%m-%d"),
        )

        # Step 2: Search multiple sources
        raw_results = []

        # Try Tavily first (best quality)
        if settings.tavily_api_key:
            tavily_results = await search_tavily(
                query=body.query,
                max_results=settings.max_search_results,
                freshness=body.freshness,
                include_domains=body.include_domains,
                exclude_domains=body.exclude_domains,
            )
            raw_results.extend(tavily_results)

        # Fallback to DuckDuckGo if needed
        if not raw_results:
            ddg_results = await search_duckduckgo(
                query=body.query,
                max_results=settings.max_search_results,
            )
            raw_results.extend(ddg_results)

        if not raw_results:
            # Empty-but-valid response rather than an error.
            return SearchResponse(
                query=body.query,
                answer="No results found for your query.",
                results=[],
                citations=[],
                temporal_context=temporal_context,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
            )

        # Step 3: Apply multi-stage reranking
        ranked_results = await rerank_results(
            query=body.query,
            results=raw_results,
            temporal_urgency=temporal_urgency,
            max_results=body.max_results,
        )

        # Step 4: Convert to SearchResult models
        search_results = []
        for i, result in enumerate(ranked_results):
            freshness = calculate_freshness_score(result.get("published_date"))
            search_results.append(
                SearchResult(
                    title=result.get("title", ""),
                    url=result.get("url", ""),
                    content=result.get("content", ""),
                    score=result.get("score", 0.5),
                    published_date=result.get("published_date"),
                    freshness_score=freshness,
                    authority_score=result.get("authority_score", 0.5),
                )
            )

        # Step 5: Synthesize answer (if requested)
        answer = None
        citations = []

        if body.include_answer and search_results:
            answer, citations = await synthesize_answer(
                query=body.query,
                results=search_results,
                temporal_context=temporal_context,
            )

        processing_time = (time.perf_counter() - start_time) * 1000

        return SearchResponse(
            query=body.query,
            answer=answer,
            results=search_results,
            citations=citations,
            temporal_context=temporal_context,
            processing_time_ms=processing_time,
        )

    except HTTPException:
        # BUG FIX: let deliberate HTTP errors propagate instead of
        # collapsing them into an opaque 500 below.
        raise
    except Exception as e:
        # Chain the original exception so logs show the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
141
+
142
+
143
+
144
@router.post(
    "/search/raw",
    response_model=SearchResponse,
    summary="Search without synthesis",
    description="Perform a search and return raw results without AI synthesis (faster).",
)
@limiter.limit("30/minute")
async def search_raw(request: Request, body: SearchRequest) -> SearchResponse:
    """Run the full search pipeline with LLM answer synthesis disabled."""
    # Delegate to the main /search handler; forcing include_answer off is
    # what makes this variant faster.
    body.include_answer = False
    response = await search(request, body)
    return response
155
+
156
+
157
@router.post(
    "/search/stream",
    summary="Search with streaming synthesis",
    description="Perform a search and stream the AI-synthesized answer in real-time using SSE.",
)
@limiter.limit("30/minute")
async def search_stream(request: Request, body: SearchRequest):
    """
    Streaming search with Server-Sent Events.

    Returns results first, then streams the answer as it's generated.

    SSE event types emitted (one JSON payload per `data:` line):
    - results: ranked results plus temporal context
    - answer_start: synthesis is about to begin
    - answer_chunk: incremental answer text
    - done: stream finished
    - error: search failed or produced no results
    """
    settings = get_settings()

    async def event_generator():
        try:
            # Step 1: Analyze temporal intent
            temporal_intent, temporal_urgency = detect_temporal_intent(body.query)

            temporal_context = TemporalContext(
                query_temporal_intent=temporal_intent,
                temporal_urgency=temporal_urgency,
                current_date=datetime.now().strftime("%Y-%m-%d"),
            )

            # Step 2: Search sources (Tavily first, DuckDuckGo as fallback)
            raw_results = []

            if settings.tavily_api_key:
                tavily_results = await search_tavily(
                    query=body.query,
                    max_results=settings.max_search_results,
                    freshness=body.freshness,
                    include_domains=body.include_domains,
                    exclude_domains=body.exclude_domains,
                )
                raw_results.extend(tavily_results)

            if not raw_results:
                ddg_results = await search_duckduckgo(
                    query=body.query,
                    max_results=settings.max_search_results,
                )
                raw_results.extend(ddg_results)

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'content': 'No results found'})}\n\n"
                return

            # Step 3: Rerank (temporal urgency influences the reranker's weighting)
            ranked_results = await rerank_results(
                query=body.query,
                results=raw_results,
                temporal_urgency=temporal_urgency,
                max_results=body.max_results,
            )

            # Step 4: Convert to SearchResult models
            search_results = []
            for result in ranked_results:
                freshness = calculate_freshness_score(result.get("published_date"))
                search_results.append(
                    SearchResult(
                        title=result.get("title", ""),
                        url=result.get("url", ""),
                        content=result.get("content", ""),
                        score=result.get("score", 0.5),
                        published_date=result.get("published_date"),
                        freshness_score=freshness,
                        authority_score=result.get("authority_score", 0.5),
                    )
                )

            # Send results first so clients can render them while the answer
            # is still being generated.
            results_data = {
                "type": "results",
                "results": [r.model_dump(mode="json") for r in search_results],
                "temporal_context": temporal_context.model_dump(),
            }
            yield f"data: {json.dumps(results_data)}\n\n"

            # Step 5: Stream answer
            yield f"data: {json.dumps({'type': 'answer_start'})}\n\n"

            async for chunk in synthesize_answer_stream(
                query=body.query,
                results=search_results,
                temporal_context=temporal_context,
            ):
                yield f"data: {json.dumps({'type': 'answer_chunk', 'content': chunk})}\n\n"

            yield f"data: {json.dumps({'type': 'done'})}\n\n"

        except Exception as e:
            # Surface failures as an SSE error event instead of breaking
            # the stream mid-response.
            yield f"data: {json.dumps({'type': 'error', 'content': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Disable nginx response buffering so events flush immediately.
            "X-Accel-Buffering": "no",
        },
    )
262
+
263
+
264
+ # === Deep Research Endpoints ===
265
+
266
@router.post(
    "/research/deep",
    summary="Deep research with multi-dimensional analysis",
    description="Decompose a query into dimensions, search each in parallel, and generate a comprehensive report.",
)
@limiter.limit("5/minute")
async def deep_research(request: Request, body: DeepResearchRequest):
    """
    Stream deep-research progress over Server-Sent Events.

    Event types emitted by the underlying generator:
    - plan_ready: Research plan with dimensions
    - dimension_start/complete: Progress per dimension
    - report_chunk: Streaming report content
    - done: Final summary
    """
    from app.agents.deep_research import run_deep_research

    # Anti-buffering headers keep reverse proxies (nginx in particular)
    # from holding back individual SSE events.
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    event_stream = run_deep_research(
        query=body.query,
        max_dimensions=body.max_dimensions,
        max_sources_per_dim=body.max_sources_per_dim,
        max_total_searches=body.max_total_searches,
    )
    return StreamingResponse(event_stream, media_type="text/event-stream", headers=sse_headers)
298
+
299
+
300
@router.post(
    "/search/heavy",
    summary="Heavy search with content scraping",
    description="Search with full content extraction from top sources for richer answers.",
)
@limiter.limit("10/minute")
async def heavy_search(request: Request, body: SearchRequest):
    """
    Stream a heavy search that scrapes full page content.

    Instead of relying on snippets, the underlying pipeline extracts full
    text from the top sources, giving richer context for answer generation.
    """
    from app.agents.heavy_search import run_heavy_search

    # Anti-buffering headers so SSE events are flushed immediately.
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    event_stream = run_heavy_search(
        query=body.query,
        max_results=body.max_results,
        max_scrape=5,
        freshness=body.freshness,
    )
    return StreamingResponse(event_stream, media_type="text/event-stream", headers=sse_headers)
329
+
330
+
331
@router.get(
    "/images",
    summary="Search for images",
    description="Search for images related to a query using Brave Image Search.",
)
@limiter.limit("60/minute")
async def image_search(request: Request, query: str, max_results: int = 6):
    """
    Search for images related to a query.

    Args:
        request: Incoming request (required by the rate limiter).
        query: Search query; must contain non-whitespace characters.
        max_results: Desired number of images (clamped to 1..50).

    Returns:
        Dict with the original query and a list of image results with
        thumbnails and source URLs.

    Raises:
        HTTPException: 400 if the query is empty or whitespace-only.
    """
    from app.sources.images import search_images

    # Reject blank queries, including whitespace-only ones that the plain
    # truthiness check would let through.
    if not query or not query.strip():
        raise HTTPException(status_code=400, detail="Query is required")

    # Clamp the caller-supplied count so a negative or huge value is never
    # forwarded to the image backend.
    max_results = max(1, min(max_results, 50))

    images = await search_images(query=query, max_results=max_results)

    return {"query": query, "images": images}
351
+
352
+
353
+ # === SearXNG Search (pure - no LLM) ===
354
+
355
@router.post(
    "/search/searxng",
    summary="Search using SearXNG + embedding reranking",
    description="Uses SearXNG meta-search with embedding reranking. No LLM synthesis.",
)
@limiter.limit("20/minute")
async def searxng_search(request: Request, body: SearchRequest):
    """
    Search using SearXNG with embedding reranking only.

    This endpoint uses your SearXNG instance for 50+ results
    and reranks with embeddings. No LLM synthesis.

    SSE event types: status, searxng_complete, results, done, error.
    """
    import json
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            # Step 1: Search SearXNG
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching SearXNG...'})}\n\n"

            # "year"/"any" freshness values map to None (no time filter).
            time_range = {"day": "day", "week": "week", "month": "month"}.get(body.freshness)
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                time_range=time_range,
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No results from SearXNG'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'searxng_complete', 'count': len(raw_results)})}\n\n"

            # Step 2: Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Reranking with embeddings...'})}\n\n"

            # Score against "title. first-500-chars-of-content" per result.
            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend: 70% embedding similarity, 30% SearXNG's own score.
            # Results are mutated in place.
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            # Step 3: Return results (no LLM)
            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"

            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            # Report failures as an SSE error event rather than aborting.
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
420
+
421
+
422
+ # === Code Search (GitHub, StackOverflow) ===
423
+
424
@router.post(
    "/search/code",
    summary="Search code repositories and programming Q&A",
    description="Uses SearXNG with GitHub, StackOverflow, and code-focused engines.",
)
@limiter.limit("20/minute")
async def code_search(request: Request, body: SearchRequest):
    """
    Search for code, programming solutions, and documentation.
    Uses GitHub, StackOverflow, GitLab, and other code-focused engines.

    SSE event types: status, search_complete, results, done, error.
    """
    import json
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching code repositories...'})}\n\n"

            # Use code-specific engines
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                categories=["it"],  # IT category includes code engines
                engines=["github", "stackoverflow", "gitlab", "npm", "pypi", "crates.io", "packagist"],
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No code results found'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'search_complete', 'count': len(raw_results)})}\n\n"

            # Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Ranking by relevance...'})}\n\n"

            # Score against "title. first-500-chars-of-content" per result.
            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend: 70% embedding similarity, 30% engine score (in place).
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"
            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            # Report failures as an SSE error event rather than aborting.
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
482
+
483
+
484
+ # === Academic Search (arXiv, Google Scholar) ===
485
+
486
@router.post(
    "/search/academic",
    summary="Search academic papers and research",
    description="Uses SearXNG with arXiv, Google Scholar, Semantic Scholar, and academic engines.",
)
@limiter.limit("20/minute")
async def academic_search(request: Request, body: SearchRequest):
    """
    Search for academic papers, research, and scientific content.
    Uses arXiv, Google Scholar, Semantic Scholar, PubMed, and other academic engines.

    SSE event types: status, search_complete, results, done, error.
    """
    import json
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching academic sources...'})}\n\n"

            # Use academic engines
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                categories=["science"],
                engines=["arxiv", "google scholar", "semantic scholar", "pubmed", "base", "crossref"],
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No academic results found'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'search_complete', 'count': len(raw_results)})}\n\n"

            # Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Ranking by relevance...'})}\n\n"

            # Score against "title. first-500-chars-of-content" per result.
            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend: 70% embedding similarity, 30% engine score (in place).
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"
            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            # Report failures as an SSE error event rather than aborting.
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
544
+
545
+
546
+ # === Browser Agent ===
547
+
548
@router.post(
    "/agent/browse",
    summary="Browser agent - navigate and extract from websites",
    description="Uses E2B sandbox. stream_visual=true for Chrome with live video, false for Camoufox stealth.",
)
@limiter.limit("10/minute")
async def browser_agent(request: Request, body: BrowseRequest):
    """
    Stream browser-agent events over SSE.

    Two backends, chosen by body.stream_visual:
    - True: Chrome with live video stream (5 min timeout)
    - False: Camoufox stealth headless (faster, anti-bot)
    """

    async def event_generator():
        try:
            # Import lazily and pick the backend; both expose the same
            # (task, url) async-generator interface.
            if body.stream_visual:
                from app.agents.browser_agent import run_browser_agent as runner
            else:
                from app.agents.browser_agent_v2 import run_browser_agent_v2 as runner
            async for event in runner(body.task, body.url):
                yield f"data: {json.dumps(event)}\n\n"
        except Exception as e:
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
579
+
app/api/schemas.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for API request/response models."""
2
+
3
+ from datetime import datetime
4
+ from typing import Literal
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
+ # === Request Models ===
10
+
11
class SearchRequest(BaseModel):
    """Search request payload.

    Used by the plain, raw, streaming, heavy, SearXNG, code, and academic
    search endpoints; fields not relevant to a given endpoint are ignored.
    """

    query: str = Field(..., min_length=1, max_length=1000, description="Search query")
    max_results: int = Field(default=10, ge=1, le=50, description="Maximum results to return")
    # "any" disables time filtering.
    freshness: Literal["day", "week", "month", "year", "any"] = Field(
        default="any",
        description="Filter results by recency"
    )
    include_domains: list[str] | None = Field(
        default=None,
        description="Only include results from these domains"
    )
    exclude_domains: list[str] | None = Field(
        default=None,
        description="Exclude results from these domains"
    )
    # Endpoints that never synthesize (e.g. /search/searxng) ignore this.
    include_answer: bool = Field(
        default=True,
        description="Include AI-generated answer"
    )
32
+
33
+
34
+ # === Response Models ===
35
+
36
class Citation(BaseModel):
    """Citation reference for the answer.

    The index is the 1-based marker used in the synthesized answer text.
    """

    index: int = Field(..., description="Citation index (1-based)")
    url: str = Field(..., description="Source URL")
    title: str = Field(..., description="Source title")
42
+
43
+
44
class TemporalContext(BaseModel):
    """Temporal metadata about the search.

    Captures how time-sensitive the query was judged to be and the date
    the judgement was made against.
    """

    query_temporal_intent: Literal["current", "historical", "neutral"] = Field(
        ...,
        description="Detected temporal intent of the query"
    )
    temporal_urgency: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="How important freshness is for this query (0-1)"
    )
    # ISO-style date string (formatted "%Y-%m-%d" by the search endpoints).
    current_date: str = Field(..., description="Current date for context")
58
+
59
+
60
class SearchResult(BaseModel):
    """Individual search result.

    `score` is the blended relevance score after reranking; freshness and
    authority are the component signals that fed into it.
    """

    title: str = Field(..., description="Result title")
    url: str = Field(..., description="Result URL")
    content: str = Field(..., description="Result content/snippet")
    score: float = Field(..., ge=0.0, le=1.0, description="Overall relevance score")
    published_date: datetime | None = Field(
        default=None,
        description="Publication date if available"
    )
    # Defaults to 0.5 (neutral) when no publication date is known.
    freshness_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="How fresh/recent the content is"
    )
    # Defaults to 0.5 (neutral) for unrecognized domains.
    authority_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Domain authority/trust score"
    )
83
+
84
+
85
class SearchResponse(BaseModel):
    """Complete search response.

    `answer` and `citations` are only populated when the request asked for
    synthesis (include_answer=True) and results were found.
    """

    query: str = Field(..., description="Original query")
    answer: str | None = Field(
        default=None,
        description="AI-generated answer synthesized from results"
    )
    results: list[SearchResult] = Field(
        default_factory=list,
        description="Ranked search results"
    )
    citations: list[Citation] = Field(
        default_factory=list,
        description="Citations referenced in the answer"
    )
    temporal_context: TemporalContext | None = Field(
        default=None,
        description="Temporal analysis metadata"
    )
    processing_time_ms: float = Field(..., description="Total processing time in milliseconds")
106
+
107
+
108
class ErrorResponse(BaseModel):
    """Error response model returned for failed requests."""

    error: str = Field(..., description="Error message")
    detail: str | None = Field(default=None, description="Detailed error information")
113
+
114
+
115
+ # === Deep Research Models ===
116
+
117
class DeepResearchRequest(BaseModel):
    """Deep research request payload.

    The bounds below cap the fan-out of the deep-research pipeline so a
    single request cannot trigger an unbounded number of searches.
    """

    query: str = Field(..., min_length=1, max_length=2000, description="Research query")
    max_dimensions: int = Field(
        default=5,
        ge=2,
        le=8,
        description="Maximum research dimensions to explore"
    )
    max_sources_per_dim: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Maximum sources per dimension"
    )
    max_total_searches: int = Field(
        default=20,
        ge=5,
        le=30,
        description="Maximum total API searches"
    )
139
+
140
+
141
+ # === Browser Agent Models ===
142
+
143
class BrowseRequest(BaseModel):
    """Browser agent request payload.

    `stream_visual` selects the backend: True uses the Chrome agent with a
    live video stream, False uses the stealth headless agent.
    """

    task: str = Field(
        ...,
        min_length=1,
        max_length=2000,
        description="Task description (e.g., 'Get the top 5 headlines')"
    )
    url: str | None = Field(
        default=None,
        description="URL to navigate to"
    )
    stream_visual: bool = Field(
        default=False,
        description="Use Chrome with live video stream (less stealth, but visual)"
    )
app/config.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Application configuration using pydantic-settings."""
2
+
3
+ from functools import lru_cache
4
+ from typing import Literal
5
+
6
+ from pydantic_settings import BaseSettings, SettingsConfigDict
7
+
8
+
9
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Values come from the process environment or a local `.env` file;
    environment variables not declared here are ignored (extra="ignore").
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # API Keys - Search Sources (empty string means "not configured")
    tavily_api_key: str = ""
    brave_api_key: str = ""  # 2000 free/month

    # SearXNG (self-hosted meta-search - uses your HF Space by default)
    searxng_url: str = "https://madras1-searxng-space.hf.space"
    serper_api_key: str | None = None

    # E2B Desktop (cloud browser for browser agent)
    e2b_api_key: str = ""

    # API Keys - LLM Providers
    groq_api_key: str | None = None
    openrouter_api_key: str | None = None

    # LLM Configuration
    llm_provider: Literal["groq", "openrouter"] = "openrouter"
    llm_model: str = "stepfun/step-3.5-flash:free"

    # Reranking Models (lightweight for HF Spaces)
    bi_encoder_model: str = "Madras1/minilm-gooaq-mnr-v5"  # Fine-tuned on GooAQ + NQ
    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"  # ~90MB
    
    # Temporal Settings
    default_freshness_half_life: int = 30  # days

    # API Settings
    max_search_results: int = 20
    max_final_results: int = 10

    # Deep Research Settings
    max_research_dimensions: int = 6
    max_tavily_calls_per_research: int = 20
    deep_research_model: str | None = None  # Use main model if None

    @property
    def llm_api_key(self) -> str:
        """Get the appropriate API key based on provider.

        Returns an empty string when the selected provider's key is unset,
        rather than None, so callers can pass it straight to clients.
        """
        if self.llm_provider == "groq":
            return self.groq_api_key or ""
        return self.openrouter_api_key or ""
59
+
60
+
61
@lru_cache
def get_settings() -> Settings:
    """Get cached settings instance.

    lru_cache makes this a lazy singleton: the environment/.env file is
    read once per process, and every caller shares the same object.
    """
    return Settings()
app/main.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lancer API - Main FastAPI application."""
2
+
3
+ from contextlib import asynccontextmanager
4
+ from datetime import datetime
5
+
6
+ from fastapi import FastAPI
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from slowapi import _rate_limit_exceeded_handler
9
+ from slowapi.errors import RateLimitExceeded
10
+
11
+ from app.api.routes import search
12
+ from app.config import get_settings
13
+ from app.middleware.rate_limiter import limiter
14
+
15
+
16
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan events.

    Code before `yield` runs once at startup; code after runs at shutdown.
    """
    # Startup: load (cached) settings and announce the configuration.
    settings = get_settings()
    # Fix: plain strings for messages with no placeholders (the originals
    # were f-strings with nothing to interpolate — ruff F541).
    print("🚀 Lancer API starting...")
    print(f" LLM Provider: {settings.llm_provider}")
    print(f" LLM Model: {settings.llm_model}")
    print(" Rate limiting: enabled")
    yield
    # Shutdown
    print("👋 Lancer API shutting down...")
28
+
29
+
30
app = FastAPI(
    title="Lancer Search API",
    description="Advanced AI-powered search API with temporal intelligence",
    version="0.1.0",
    lifespan=lifespan,
)

# Rate limiting: slowapi reads the limiter off app.state; limit violations
# raise RateLimitExceeded, handled by the stock slowapi handler.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# wide open — confirm this is intended before public exposure.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers (all search endpoints live under /api/v1)
app.include_router(search.router, prefix="/api/v1", tags=["search"])
52
+
53
+
54
+ @app.get("/health")
55
+ async def health_check():
56
+ """Health check endpoint."""
57
+ return {
58
+ "status": "healthy",
59
+ "timestamp": datetime.now().isoformat(),
60
+ "version": "0.1.0",
61
+ }
62
+
63
+
64
+ @app.get("/")
65
+ async def root():
66
+ """Root endpoint with API info."""
67
+ return {
68
+ "name": "Lancer Search API",
69
+ "version": "0.1.0",
70
+ "docs": "/docs",
71
+ "health": "/health",
72
+ }
app/middleware/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Middleware package."""
app/middleware/rate_limiter.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rate limiting middleware using SlowAPI.
2
+
3
+ Provides IP-based rate limiting for all API endpoints.
4
+ """
5
+
6
+ from slowapi import Limiter
7
+ from slowapi.util import get_remote_address
8
+ from slowapi.errors import RateLimitExceeded
9
+ from slowapi.middleware import SlowAPIMiddleware
10
+ from fastapi import Request
11
+ from fastapi.responses import JSONResponse
12
+
13
+
14
# Create limiter instance with IP-based key.
# Endpoints override the default with their own @limiter.limit(...) values.
limiter = Limiter(
    key_func=get_remote_address,  # rate-limit per client IP address
    default_limits=["100/minute"],
    storage_uri="memory://",  # Use memory storage (OK for single instance on HF Spaces)
)
20
+
21
+
22
def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    """Custom handler for rate limit exceeded errors.

    Returns a structured 429 JSON body instead of slowapi's default.
    NOTE(review): not every RateLimitExceeded carries retry_after, hence
    the getattr fallback of 60 seconds.
    """
    return JSONResponse(
        status_code=429,
        content={
            "error": "rate_limit_exceeded",
            "message": f"Rate limit exceeded: {exc.detail}",
            "retry_after": getattr(exc, "retry_after", 60),
        },
    )
32
+
33
+
34
# Rate limit decorators for different endpoints.
# NOTE(review): these mirror the string literals passed to
# @limiter.limit(...) in the route modules; they are not applied
# automatically — keep them in sync manually.
LIMITS = {
    "search": "30/minute",
    "heavy": "10/minute",
    "deep": "5/minute",
    "images": "60/minute",
}
41
+
42
+
43
def get_limiter():
    """Get the limiter instance for dependency injection.

    Returns the module-level singleton so all callers share one limiter.
    """
    return limiter
app/reranking/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Reranking module."""
app/reranking/authority_scorer.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain authority scoring.
2
+
3
+ Assigns trust/authority scores to domains based on known reliable sources.
4
+ """
5
+
6
+ from urllib.parse import urlparse
7
+
8
+
9
# High authority domains (trusted sources).
# Keys beginning with "." are TLD suffixes; every other key matches the
# exact domain or any of its subdomains.
HIGH_AUTHORITY_DOMAINS = {
    # Academic & Research
    ".edu": 0.9,
    ".gov": 0.9,
    ".ac.uk": 0.85,

    # Major tech companies
    "github.com": 0.8,
    "stackoverflow.com": 0.8,
    "docs.python.org": 0.85,
    "developer.mozilla.org": 0.85,
    "arxiv.org": 0.9,

    # Major news sources
    "reuters.com": 0.8,
    "bbc.com": 0.75,
    "nytimes.com": 0.75,
    "theguardian.com": 0.75,

    # Reference
    "wikipedia.org": 0.7,
    "britannica.com": 0.8,

    # AI/ML specific
    "openai.com": 0.85,
    "anthropic.com": 0.85,
    "huggingface.co": 0.8,
    "deepmind.google": 0.85,
    "ai.meta.com": 0.8,

    # Tech publications
    "techcrunch.com": 0.7,
    "wired.com": 0.7,
    "arstechnica.com": 0.75,
    "theverge.com": 0.7,
}

# Low authority patterns (less reliable, largely user-generated content).
LOW_AUTHORITY_PATTERNS = [
    "medium.com",  # User-generated, variable quality
    "reddit.com",  # Forum, variable quality
    "quora.com",  # Q&A, variable quality
    "blogspot.com",
    "wordpress.com",
    "tumblr.com",
]


def calculate_authority_score(url: str) -> float:
    """
    Calculate domain authority score for a URL.

    Args:
        url: The URL to score

    Returns:
        Authority score between 0.0 and 1.0 (0.5 for unknown/invalid URLs)
    """
    if not url:
        return 0.5

    try:
        domain = urlparse(url).netloc.lower()

        # Remove www. prefix
        if domain.startswith("www."):
            domain = domain[4:]

        # One pass over the known-domain table. Suffix keys (".edu") match
        # the TLD; plain keys match the exact domain or a true subdomain.
        # Fix: the previous bare endswith() check wrongly matched e.g.
        # "cnbc.com" against "bbc.com" or "notgithub.com" against
        # "github.com".
        for known_domain, score in HIGH_AUTHORITY_DOMAINS.items():
            if known_domain.startswith("."):
                if domain.endswith(known_domain):
                    return score
            elif domain == known_domain or domain.endswith("." + known_domain):
                return score

        # Check for low authority patterns (substring match so subdomains
        # like blog.medium.com are caught too).
        for pattern in LOW_AUTHORITY_PATTERNS:
            if pattern in domain:
                return 0.4

        # Default score for unknown domains
        return 0.5

    except Exception:
        # Malformed URLs fall back to the neutral score.
        return 0.5
99
+
100
+
101
def get_domain_category(url: str) -> str:
    """
    Get a category label for the domain.

    Args:
        url: The URL to categorize

    Returns:
        Category string like "Academic", "News", "Developer", etc.
        ("Unknown" for empty/invalid URLs)
    """
    if not url:
        return "Unknown"

    try:
        domain = urlparse(url).netloc.lower()

        # Fix: TLDs are checked with endswith() instead of substring
        # containment, so e.g. "www.education.com" is no longer misread
        # as a .edu site.
        if domain.endswith((".edu", ".ac.uk")) or "arxiv" in domain:
            return "Academic"
        elif domain.endswith(".gov"):
            return "Government"
        elif any(site in domain for site in ["github", "stackoverflow", "docs."]):
            return "Developer"
        elif any(site in domain for site in ["reuters", "bbc", "nytimes", "cnn", "guardian"]):
            return "News"
        elif any(site in domain for site in ["openai", "anthropic", "huggingface", "deepmind"]):
            return "AI/ML"
        elif "wikipedia" in domain:
            return "Reference"
        else:
            return "General"

    except Exception:
        # Malformed URLs fall back to the unknown bucket.
        return "Unknown"
app/reranking/embeddings.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Embedding-based reranking using sentence-transformers.
2
+
3
+ Provides bi-encoder and cross-encoder reranking for better relevance scoring.
4
+ """
5
+
6
+ from functools import lru_cache
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+
11
+ from app.config import get_settings
12
+
13
+
14
@lru_cache(maxsize=1)
def get_bi_encoder():
    """Load the bi-encoder sentence-transformer once and reuse it.

    The import is deferred so the heavy dependency is only paid when
    embedding reranking is actually used.
    """
    from sentence_transformers import SentenceTransformer

    model_name = get_settings().bi_encoder_model
    return SentenceTransformer(model_name)
20
+
21
+
22
@lru_cache(maxsize=1)
def get_cross_encoder():
    """Load the cross-encoder model once and reuse it.

    Deferred import keeps startup fast when reranking is disabled.
    """
    from sentence_transformers import CrossEncoder

    model_name = get_settings().cross_encoder_model
    return CrossEncoder(model_name)
28
+
29
+
30
def compute_bi_encoder_scores(
    query: str,
    documents: list[str],
) -> list[float]:
    """
    Score documents against a query with the (fast) bi-encoder.

    Embeds the query and every document, takes cosine similarities, and
    maps them from [-1, 1] into [0, 1]. Cheaper but less precise than the
    cross-encoder; good for pre-filtering large candidate sets.

    Args:
        query: Search query
        documents: List of document texts

    Returns:
        List of similarity scores (0-1)
    """
    if not documents:
        return []

    encoder = get_bi_encoder()

    # Normalized embeddings => dot product equals cosine similarity.
    query_vec = encoder.encode(query, normalize_embeddings=True)
    doc_vecs = encoder.encode(documents, normalize_embeddings=True)
    cosines = np.dot(doc_vecs, query_vec)

    # Shift the cosine range [-1, 1] into [0, 1].
    return [(float(c) + 1) / 2 for c in cosines]
63
+
64
+
65
def compute_cross_encoder_scores(
    query: str,
    documents: list[str],
) -> list[float]:
    """
    Score query-document pairs with the (accurate) cross-encoder.

    Raw cross-encoder outputs are min-max normalized into [0, 1]; when
    all scores are identical every document gets 0.5. Slower than the
    bi-encoder — run it on an already-filtered candidate set.

    Args:
        query: Search query
        documents: List of document texts

    Returns:
        List of relevance scores (0-1)
    """
    if not documents:
        return []

    encoder = get_cross_encoder()
    raw_scores = encoder.predict([[query, doc] for doc in documents])

    lo = float(np.min(raw_scores))
    hi = float(np.max(raw_scores))

    if hi > lo:
        return [(float(s) - lo) / (hi - lo) for s in raw_scores]
    # Degenerate case: all scores equal — nothing to rank on.
    return [0.5] * len(raw_scores)
app/reranking/pipeline.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Multi-stage reranking pipeline.
2
+
3
+ Implements a 3-stage reranking approach:
4
+ 1. Bi-Encoder: Fast semantic similarity (for large result sets)
5
+ 2. Cross-Encoder: Accurate relevance scoring
6
+ 3. Temporal + Authority: Freshness and domain trust weighting
7
+ """
8
+
9
+ import logging
10
+ from typing import Optional
11
+
12
+ from app.temporal.freshness_scorer import calculate_freshness_score, adjust_score_by_freshness
13
+ from app.reranking.authority_scorer import calculate_authority_score
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Flag to enable/disable embedding-based reranking
18
+ ENABLE_EMBEDDING_RERANKING = True
19
+
20
+
21
+ async def rerank_results(
22
+ query: str,
23
+ results: list[dict],
24
+ temporal_urgency: float = 0.5,
25
+ max_results: int = 10,
26
+ use_embeddings: bool = True,
27
+ ) -> list[dict]:
28
+ """
29
+ Apply multi-stage reranking to search results.
30
+
31
+ Pipeline:
32
+ 1. Bi-encoder: Quick semantic filtering (if results > 20)
33
+ 2. Cross-encoder: Precise relevance scoring (top candidates)
34
+ 3. Temporal + Authority: Freshness and trust weighting
35
+
36
+ Args:
37
+ query: Original search query
38
+ results: Raw search results
39
+ temporal_urgency: How important freshness is (0-1)
40
+ max_results: Maximum results to return
41
+ use_embeddings: Whether to use embedding models
42
+
43
+ Returns:
44
+ Reranked results with updated scores
45
+ """
46
+ if not results:
47
+ return []
48
+
49
+ scored_results = results.copy()
50
+
51
+ # Stage 1 & 2: Embedding-based reranking
52
+ if use_embeddings and ENABLE_EMBEDDING_RERANKING:
53
+ try:
54
+ scored_results = await _apply_embedding_reranking(query, scored_results)
55
+ logger.info(f"Applied embedding reranking to {len(scored_results)} results")
56
+ except Exception as e:
57
+ logger.warning(f"Embedding reranking failed, using fallback: {e}")
58
+ # Fall through to basic scoring
59
+
60
+ # Stage 3: Apply temporal + authority scoring
61
+ for result in scored_results:
62
+ # Calculate freshness score
63
+ freshness = calculate_freshness_score(result.get("published_date"))
64
+ result["freshness_score"] = freshness
65
+
66
+ # Calculate authority score
67
+ authority = calculate_authority_score(result.get("url", ""))
68
+ result["authority_score"] = authority
69
+
70
+ # Get base score (from search source or embedding)
71
+ base_score = result.get("score", 0.5)
72
+
73
+ # Adjust for freshness based on temporal urgency
74
+ adjusted_score = adjust_score_by_freshness(
75
+ base_score=base_score,
76
+ freshness_score=freshness,
77
+ temporal_urgency=temporal_urgency,
78
+ )
79
+
80
+ # Also factor in authority (10% weight)
81
+ final_score = (adjusted_score * 0.9) + (authority * 0.1)
82
+ result["score"] = final_score
83
+
84
+ # Sort by final score (descending)
85
+ scored_results.sort(key=lambda x: x["score"], reverse=True)
86
+
87
+ return scored_results[:max_results]
88
+
89
+
90
async def _apply_embedding_reranking(
    query: str,
    results: list[dict],
) -> list[dict]:
    """Apply bi-encoder and cross-encoder reranking.

    Bug fix vs. the original implementation: after sorting by bi-encoder
    score, ``documents`` is rebuilt from the sorted results. Previously it
    kept the pre-sort order, so cross-encoder scores were assigned to the
    wrong results whenever the bi-encoder stage ran.

    Args:
        query: Search query.
        results: Search result dicts; mutated in place with
            ``bi_encoder_score`` / ``cross_encoder_score`` and an updated
            ``score``.

    Returns:
        The (possibly truncated) reranked result list.
    """
    from app.reranking.embeddings import compute_bi_encoder_scores, compute_cross_encoder_scores

    def _doc_text(result: dict) -> str:
        # Title plus first 500 chars of content is enough signal to rerank.
        return f"{result.get('title', '')}. {result.get('content', '')[:500]}"

    documents = [_doc_text(r) for r in results]

    # Stage 1: bi-encoder pre-filter (fast) when the candidate set is large.
    if len(results) > 15:
        bi_scores = compute_bi_encoder_scores(query, documents)
        for result, score in zip(results, bi_scores):
            result["bi_encoder_score"] = score

        # Keep only the top 15 for the expensive cross-encoder pass.
        results.sort(key=lambda x: x.get("bi_encoder_score", 0), reverse=True)
        results = results[:15]
        # Rebuild documents so index i still corresponds to results[i].
        documents = [_doc_text(r) for r in results]

    # Stage 2: cross-encoder for precise scoring (slower but accurate).
    cross_scores = compute_cross_encoder_scores(query, documents)

    for result, cross_score in zip(results, cross_scores):
        original_score = result.get("score", 0.5)
        # Cross-encoder gets 70% weight, original source score 30%.
        result["score"] = (cross_score * 0.7) + (original_score * 0.3)
        result["cross_encoder_score"] = cross_score

    return results
127
+
app/sources/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Search sources module."""
app/sources/aggregator.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Multi-source search aggregator.
2
+
3
+ Combines results from multiple search sources in parallel.
4
+ """
5
+
6
+ import asyncio
7
+ from typing import Optional
8
+ from urllib.parse import urlparse
9
+
10
+ from app.config import get_settings
11
+ from app.sources.tavily import search_tavily
12
+ from app.sources.brave import search_brave
13
+ from app.sources.duckduckgo import search_duckduckgo
14
+ from app.sources.wikipedia import search_wikipedia
15
+ from app.sources.searxng import search_searxng
16
+
17
+
18
async def aggregate_search(
    query: str,
    max_results: int = 15,
    freshness: str = "any",
    include_wikipedia: bool = True,
    include_domains: Optional[list[str]] = None,
    exclude_domains: Optional[list[str]] = None,
) -> list[dict]:
    """
    Aggregate search results from multiple sources in parallel.

    Sources are queried concurrently with ``asyncio.gather``; a failing
    source is logged and skipped rather than failing the whole search.
    Merged results are deduplicated by normalized URL and sorted by score.

    Args:
        query: Search query
        max_results: Maximum total results to return
        freshness: Freshness filter (day, week, month, year, any)
        include_wikipedia: Whether to include Wikipedia results
        include_domains: Only include these domains (Tavily only)
        exclude_domains: Exclude these domains (Tavily only)

    Returns:
        Deduplicated, merged list of search results
    """
    settings = get_settings()

    # Build list of search tasks.
    # NOTE: ``tasks`` and ``source_names`` are parallel lists — every append
    # to one must be matched in the other, because failures are attributed
    # by index after gather() below.
    tasks = []
    source_names = []

    # SearXNG (if configured - free, high volume)
    if hasattr(settings, 'searxng_url') and settings.searxng_url:
        # "year"/"any" map to None — SearXNG then applies no time filter.
        time_range = {"day": "day", "week": "week", "month": "month"}.get(freshness)
        tasks.append(search_searxng(
            query=query,
            max_results=15,
            time_range=time_range,
        ))
        source_names.append("searxng")

    # Tavily (primary source - if API key available)
    if settings.tavily_api_key:
        tasks.append(search_tavily(
            query=query,
            max_results=12,  # Primary source
            freshness=freshness,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
        ))
        source_names.append("tavily")

    # Brave (secondary - limited quota, use sparingly)
    if settings.brave_api_key:
        tasks.append(search_brave(
            query=query,
            max_results=5,  # Reduced to save quota
            freshness=freshness,
        ))
        source_names.append("brave")

    # DuckDuckGo (always available, free)
    tasks.append(search_duckduckgo(
        query=query,
        max_results=12,  # Free, can use more
    ))
    source_names.append("duckduckgo")

    # Wikipedia (for context/background)
    if include_wikipedia:
        tasks.append(search_wikipedia(
            query=query,
            max_results=5,
        ))
        source_names.append("wikipedia")

    # Run all searches in parallel; return_exceptions keeps one failure
    # from cancelling the others.
    results_lists = await asyncio.gather(*tasks, return_exceptions=True)

    # Merge results, skipping sources that raised.
    all_results = []
    for i, results in enumerate(results_lists):
        if isinstance(results, Exception):
            print(f"Source {source_names[i]} failed: {results}")
            continue
        if results:
            all_results.extend(results)

    # Deduplicate by normalized URL — first occurrence wins, so sources
    # appended earlier take precedence for duplicate links.
    seen_urls = set()
    unique_results = []

    for result in all_results:
        url = result.get("url", "")
        normalized_url = _normalize_url(url)

        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_results.append(result)

    # Sort by score (descending)
    unique_results.sort(key=lambda x: x.get("score", 0), reverse=True)

    return unique_results[:max_results]
119
+
120
+
121
+ def _normalize_url(url: str) -> str:
122
+ """Normalize URL for deduplication."""
123
+ try:
124
+ parsed = urlparse(url)
125
+ # Remove www., trailing slashes, and query params for comparison
126
+ host = parsed.netloc.replace("www.", "")
127
+ path = parsed.path.rstrip("/")
128
+ return f"{host}{path}".lower()
129
+ except:
130
+ return url.lower()
131
+
132
+
133
async def get_available_sources() -> list[str]:
    """Return the search sources usable with the current configuration.

    DuckDuckGo and Wikipedia need no credentials and are always listed;
    SearXNG, Tavily and Brave are appended only when configured.
    """
    settings = get_settings()

    available = ["duckduckgo", "wikipedia"]  # No API key required.
    if getattr(settings, "searxng_url", None):
        available.append("searxng")
    if settings.tavily_api_key:
        available.append("tavily")
    if settings.brave_api_key:
        available.append("brave")
    return available
app/sources/brave.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Brave Search API source.
2
+
3
+ Official Brave Search API with 2000 free queries/month.
4
+ https://api.search.brave.com/
5
+ """
6
+
7
+ from datetime import datetime
8
+ from typing import Optional
9
+
10
+ import httpx
11
+
12
+ from app.config import get_settings
13
+
14
+
15
async def search_brave(
    query: str,
    max_results: int = 10,
    freshness: str = "any",
    country: str = "BR",
) -> list[dict]:
    """
    Search using Brave Search API.

    Args:
        query: Search query
        max_results: Maximum results (capped at 20, Brave's per-request limit)
        freshness: "day", "week", "month", "year", or "any"
        country: Country code for results. NOTE: the default ("BR") together
            with the hard-coded ``search_lang="pt"`` below biases results to
            Brazil/Portuguese.

    Returns:
        List of search results with title, url, content, published_date, score.
        Empty list when no API key is configured or the request fails.
    """
    settings = get_settings()

    # Without a subscription token the endpoint rejects the request — bail early.
    if not settings.brave_api_key:
        return []

    # Map our freshness names to Brave's codes ("pd" = past day, etc.).
    freshness_map = {
        "day": "pd",
        "week": "pw",
        "month": "pm",
        "year": "py",
        "any": None,
    }
    brave_freshness = freshness_map.get(freshness)

    params = {
        "q": query,
        "count": min(max_results, 20),
        "country": country,
        "search_lang": "pt",
        "text_decorations": False,
    }

    if brave_freshness:
        params["freshness"] = brave_freshness

    headers = {
        "Accept": "application/json",
        "X-Subscription-Token": settings.brave_api_key,
    }

    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(
                "https://api.search.brave.com/res/v1/web/search",
                params=params,
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()

            results = []
            web_results = data.get("web", {}).get("results", [])

            for i, item in enumerate(web_results):
                # Brave reports recency as a relative "age" string; convert
                # it to a datetime when possible.
                published_date = None
                age = item.get("age")
                if age:
                    published_date = _parse_brave_age(age)

                results.append({
                    "title": item.get("title", ""),
                    "url": item.get("url", ""),
                    "content": item.get("description", ""),
                    "published_date": published_date,
                    "score": 0.8 - (i * 0.05),  # Decay score by result position
                    "source": "brave",
                })

            return results

    except httpx.HTTPStatusError as e:
        # Non-2xx from the API (e.g. 429 quota exhausted) — degrade to empty.
        print(f"Brave API error: {e.response.status_code}")
        return []
    except Exception as e:
        # Network / parsing errors — degrade to empty rather than raise.
        print(f"Brave search error: {e}")
        return []
101
+
102
+
103
+ def _parse_brave_age(age: str) -> Optional[datetime]:
104
+ """Parse Brave's age string like '2 days ago' to datetime."""
105
+ import re
106
+
107
+ now = datetime.now()
108
+
109
+ patterns = [
110
+ (r"(\d+)\s*hour", lambda m: now.replace(hour=now.hour - int(m.group(1)))),
111
+ (r"(\d+)\s*day", lambda m: now.replace(day=now.day - int(m.group(1)))),
112
+ (r"(\d+)\s*week", lambda m: now.replace(day=now.day - int(m.group(1)) * 7)),
113
+ (r"(\d+)\s*month", lambda m: now.replace(month=now.month - int(m.group(1)))),
114
+ ]
115
+
116
+ for pattern, func in patterns:
117
+ match = re.search(pattern, age, re.IGNORECASE)
118
+ if match:
119
+ try:
120
+ return func(match)
121
+ except ValueError:
122
+ pass
123
+
124
+ return None
app/sources/duckduckgo.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DuckDuckGo search source (free fallback).
2
+
3
+ Uses the duckduckgo_search library for free web search.
4
+ """
5
+
6
+ from datetime import datetime, timedelta
7
+ from typing import Optional
8
+
9
+ import httpx
10
+
11
+
12
async def search_duckduckgo(
    query: str,
    max_results: int = 10,
    region: str = "wt-wt",  # Worldwide
) -> list[dict]:
    """
    Search using DuckDuckGo (free, no API key required).

    Fetches the lightweight DuckDuckGo Lite HTML endpoint and parses it
    with ``parse_ddg_lite_results``. Serves as a no-cost fallback when
    API-based sources are unavailable; returns an empty list on any error.

    Args:
        query: Search query
        max_results: Maximum results to return
        region: Region code

    Returns:
        List of result dicts with title, url, content
    """
    request_params = {
        "q": query,
        "kl": region,
        "kp": "-1",  # Safe search off
    }
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        # DuckDuckGo Lite: plain HTML, no JS, simpler to parse.
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(
                "https://lite.duckduckgo.com/lite/",
                params=request_params,
                headers=request_headers,
                follow_redirects=True,
            )
            response.raise_for_status()
            page = response.text

        return parse_ddg_lite_results(page, max_results)

    except Exception as e:
        print(f"DuckDuckGo search error: {e}")
        return []
61
+
62
+
63
def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
    """
    Parse DuckDuckGo Lite HTML results.

    Fixes vs. the original parser:
    - The anchor regex no longer requires ``class`` to appear before
      ``href``; DDG Lite emits ``href`` first (as the original's own
      example comment showed), so the old pattern never matched.
    - HTML entities in titles/snippets are actually unescaped (the old
      code only called ``strip()`` despite its "Clean up HTML entities"
      comment).

    Args:
        html: Raw DuckDuckGo Lite page HTML.
        max_results: Maximum number of results to return.

    Returns:
        List of result dicts (title, url, content, published_date, score, source).
    """
    import re
    from html import unescape

    results = []

    # Match any result anchor regardless of attribute order, then pull the
    # href out of the attribute blob in a second pass.
    anchor_pattern = r"<a\b([^>]*?result-link[^>]*)>([^<]+)</a>"
    href_pattern = r"href=[\"']([^\"']+)[\"']"
    # Snippets live in <td class="result-snippet"> cells, index-aligned
    # with the anchors.
    snippet_pattern = r"<td[^>]*class=[\"']result-snippet[\"'][^>]*>([^<]+)</td>"

    anchors = re.findall(anchor_pattern, html, re.IGNORECASE)
    snippets = re.findall(snippet_pattern, html, re.IGNORECASE)

    for i, (attrs, raw_title) in enumerate(anchors[:max_results]):
        href_match = re.search(href_pattern, attrs, re.IGNORECASE)
        if not href_match:
            continue
        url = href_match.group(1)

        # Skip DuckDuckGo internal links
        if "duckduckgo.com" in url:
            continue

        raw_snippet = snippets[i] if i < len(snippets) else ""

        results.append({
            "title": unescape(raw_title.strip()),
            "url": url,
            "content": unescape(raw_snippet.strip()),
            "published_date": None,  # DDG Lite doesn't provide dates
            "score": 0.5,  # Neutral score, will be reranked
            "source": "duckduckgo",
        })

    return results[:max_results]
app/sources/images.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image Search source.
2
+
3
+ Uses Tavily API with include_images=True for image search.
4
+ Falls back to Brave Image Search if Tavily unavailable.
5
+ """
6
+
7
+ from typing import Optional
8
+
9
+ import httpx
10
+
11
+ from app.config import get_settings
12
+
13
+
14
async def search_images(
    query: str,
    max_results: int = 6,
) -> list[dict]:
    """
    Search for images using available APIs.

    Tries Tavily first (reuses the main search API key); when Tavily is
    unconfigured or returns nothing, falls back to Brave Image Search.
    Returns an empty list if neither provider is configured.

    Args:
        query: Search query
        max_results: Maximum images to return

    Returns:
        List of image results with url, thumbnail, title
    """
    settings = get_settings()

    # Preferred provider: Tavily (shares the main search API key).
    if settings.tavily_api_key:
        tavily_images = await _search_tavily_images(query, max_results)
        if tavily_images:
            return tavily_images

    # Fallback provider: Brave image search.
    if settings.brave_api_key:
        return await _search_brave_images(query, max_results)

    return []
45
+
46
+
47
async def _search_tavily_images(query: str, max_results: int) -> list[dict]:
    """Search images via the Tavily API.

    Args:
        query: Search query.
        max_results: Maximum images to return.

    Returns:
        List of dicts with ``url``, ``thumbnail`` and ``title`` keys
        (thumbnail is the same URL — Tavily returns no separate thumb).
        Empty list on any request/parsing error.
    """
    settings = get_settings()

    payload = {
        "api_key": settings.tavily_api_key,
        "query": query,
        "search_depth": "basic",
        "max_results": 5,  # We just need images, not full results
        "include_images": True,
        "include_image_descriptions": True,
        "include_answer": False,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                "https://api.tavily.com/search",
                json=payload,
            )
            response.raise_for_status()
            data = response.json()

            results = []
            images = data.get("images", [])

            # Tavily returns either bare URL strings or, with
            # include_image_descriptions, dicts with url + description.
            for img in images[:max_results]:
                if isinstance(img, str):
                    # Simple URL format
                    results.append({
                        "url": img,
                        "thumbnail": img,
                        "title": "",
                    })
                elif isinstance(img, dict):
                    # Dict format with description
                    results.append({
                        "url": img.get("url", ""),
                        "thumbnail": img.get("url", ""),
                        "title": img.get("description", ""),
                    })

            return results

    except Exception as e:
        # Best-effort: image search is optional, so degrade to empty.
        print(f"Tavily image search error: {e}")
        return []
94
+
95
+
96
async def _search_brave_images(query: str, max_results: int) -> list[dict]:
    """Search images via the Brave Image Search API.

    Args:
        query: Search query.
        max_results: Maximum images to return (capped at 20 per request).

    Returns:
        List of dicts with ``url``, ``thumbnail`` and ``title`` keys.
        Empty list on any request/parsing error.
    """
    settings = get_settings()

    params = {
        "q": query,
        "count": min(max_results, 20),
        "safesearch": "moderate",
    }

    headers = {
        "Accept": "application/json",
        "X-Subscription-Token": settings.brave_api_key,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://api.search.brave.com/res/v1/images/search",
                params=params,
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()

            results = []
            images = data.get("results", [])

            # Brave nests the full-size URL under "properties" and the
            # thumbnail under "thumbnail.src".
            for img in images[:max_results]:
                results.append({
                    "url": img.get("properties", {}).get("url", ""),
                    "thumbnail": img.get("thumbnail", {}).get("src", ""),
                    "title": img.get("title", ""),
                })

            return results

    except Exception as e:
        # Best-effort: image search is optional, so degrade to empty.
        print(f"Brave image search error: {e}")
        return []
app/sources/scraper.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Content Scraper.
2
+
3
+ Extracts clean text content from URLs for deeper analysis.
4
+ """
5
+
6
+ import asyncio
7
+ from typing import Optional
8
+
9
+ import httpx
10
+
11
+
12
async def scrape_url_content(
    url: str,
    max_chars: int = 5000,
    timeout: float = 10.0,
) -> Optional[str]:
    """
    Scrape and extract clean text content from a URL.

    Fetches the page with browser-like headers, then extracts readable
    text — via trafilatura when installed, otherwise via the regex-based
    ``_simple_extract`` fallback.

    Args:
        url: URL to scrape
        max_chars: Maximum characters to return
        timeout: Request timeout in seconds

    Returns:
        Extracted text content (truncated to max_chars) or None if the
        fetch failed or nothing could be extracted.
    """
    try:
        # Browser-like headers reduce the chance of bot-blocking.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }

        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=headers)
            response.raise_for_status()
            html = response.text

        # Preferred extractor: trafilatura (best quality), used only if
        # the optional dependency is installed.
        try:
            import trafilatura
            text = trafilatura.extract(
                html,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            if text:
                return text[:max_chars]
        except ImportError:
            pass

        # Fallback: simple regex-based HTML extraction (no dependencies).
        text = _simple_extract(html)
        return text[:max_chars] if text else None

    except Exception as e:
        # Scraping is best-effort; report and signal failure with None.
        print(f"Scrape error for {url}: {e}")
        return None
61
+
62
+
63
+ def _simple_extract(html: str) -> str:
64
+ """Simple HTML text extraction without external libs."""
65
+ import re
66
+
67
+ # Remove script and style tags
68
+ html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
69
+ html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
70
+ html = re.sub(r'<head[^>]*>.*?</head>', '', html, flags=re.DOTALL | re.IGNORECASE)
71
+ html = re.sub(r'<nav[^>]*>.*?</nav>', '', html, flags=re.DOTALL | re.IGNORECASE)
72
+ html = re.sub(r'<footer[^>]*>.*?</footer>', '', html, flags=re.DOTALL | re.IGNORECASE)
73
+
74
+ # Remove all HTML tags
75
+ text = re.sub(r'<[^>]+>', ' ', html)
76
+
77
+ # Clean up whitespace
78
+ text = re.sub(r'\s+', ' ', text)
79
+ text = text.strip()
80
+
81
+ return text
82
+
83
+
84
async def scrape_multiple_urls(
    urls: list[str],
    max_chars_per_url: int = 3000,
    max_concurrent: int = 5,
) -> dict[str, Optional[str]]:
    """
    Scrape multiple URLs concurrently.

    A semaphore caps the number of in-flight requests so a long URL list
    does not open an unbounded number of connections.

    Args:
        urls: List of URLs to scrape
        max_chars_per_url: Max chars per URL
        max_concurrent: Max concurrent requests

    Returns:
        Dict mapping URL to extracted content (or None if failed)
    """
    gate = asyncio.Semaphore(max_concurrent)

    async def _bounded_scrape(target: str) -> tuple[str, Optional[str]]:
        # Hold a semaphore slot for the duration of the request.
        async with gate:
            return target, await scrape_url_content(target, max_chars_per_url)

    pairs = await asyncio.gather(*(_bounded_scrape(u) for u in urls))
    return dict(pairs)
app/sources/searxng.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SearXNG meta-search source.
2
+
3
+ Uses a self-hosted SearXNG instance for comprehensive search results
4
+ from multiple engines (Google, Bing, DDG, etc.) without API costs.
5
+ """
6
+
7
+ from typing import Optional
8
+ from datetime import datetime
9
+
10
+ import httpx
11
+
12
+ from app.config import get_settings
13
+
14
+
15
+ # Default SearXNG instance (your HF Space)
16
+ DEFAULT_SEARXNG_URL = "https://madras1-searxng-space.hf.space"
17
+
18
+ # No fallbacks - use only your instance
19
+ FALLBACK_INSTANCES = []
20
+
21
+
22
+ async def search_searxng(
23
+ query: str,
24
+ max_results: int = 50,
25
+ categories: Optional[list[str]] = None,
26
+ engines: Optional[list[str]] = None,
27
+ language: str = "all",
28
+ time_range: Optional[str] = None,
29
+ searxng_url: Optional[str] = None,
30
+ ) -> list[dict]:
31
+ """
32
+ Search using SearXNG meta-search engine.
33
+
34
+ Returns many more results than API-based sources, making
35
+ embedding-based reranking valuable.
36
+
37
+ Args:
38
+ query: Search query
39
+ max_results: Maximum results to return (can be 50-100+)
40
+ categories: Search categories (general, news, science, etc.)
41
+ engines: Specific engines to use (google, bing, etc.)
42
+ language: Language code (en, pt, all)
43
+ time_range: Time filter (day, week, month, year)
44
+ searxng_url: Custom SearXNG instance URL
45
+
46
+ Returns:
47
+ List of search results with title, url, content, source
48
+ """
49
+ settings = get_settings()
50
+
51
+ # Build instance list
52
+ instances = []
53
+ if searxng_url:
54
+ instances.append(searxng_url)
55
+ if hasattr(settings, 'searxng_url') and settings.searxng_url:
56
+ instances.append(settings.searxng_url)
57
+ instances.append(DEFAULT_SEARXNG_URL)
58
+ instances.extend(FALLBACK_INSTANCES)
59
+
60
+ # Build params
61
+ params = {
62
+ "q": query,
63
+ "format": "json",
64
+ "language": language,
65
+ }
66
+
67
+ if categories:
68
+ params["categories"] = ",".join(categories)
69
+ if engines:
70
+ params["engines"] = ",".join(engines)
71
+ if time_range:
72
+ params["time_range"] = time_range
73
+
74
+ # Try each instance
75
+ for instance in instances:
76
+ try:
77
+ results = await _fetch_searxng(instance, params, max_results)
78
+ if results:
79
+ return results
80
+ except Exception as e:
81
+ print(f"SearXNG instance {instance} failed: {e}")
82
+ continue
83
+
84
+ return []
85
+
86
+
87
+ async def _fetch_searxng(
88
+ instance_url: str,
89
+ params: dict,
90
+ max_results: int,
91
+ ) -> list[dict]:
92
+ """Fetch results from a SearXNG instance."""
93
+
94
+ # Use browser-like headers to avoid blocks
95
+ headers = {
96
+ "Accept": "application/json",
97
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
98
+ "Accept-Language": "en-US,en;q=0.9",
99
+ }
100
+
101
+ async with httpx.AsyncClient(timeout=15.0) as client:
102
+ response = await client.get(
103
+ f"{instance_url.rstrip('/')}/search",
104
+ params=params,
105
+ headers=headers,
106
+ )
107
+ response.raise_for_status()
108
+ data = response.json()
109
+
110
+ results = []
111
+ for item in data.get("results", [])[:max_results]:
112
+ result = {
113
+ "title": item.get("title", ""),
114
+ "url": item.get("url", ""),
115
+ "content": item.get("content", ""),
116
+ "source": f"searxng:{item.get('engine', 'unknown')}",
117
+ "score": _calculate_score(item),
118
+ }
119
+
120
+ # Extract date if available
121
+ published_date = item.get("publishedDate")
122
+ if published_date:
123
+ result["published_date"] = published_date
124
+
125
+ results.append(result)
126
+
127
+ return results
128
+
129
+
130
+ def _calculate_score(item: dict) -> float:
131
+ """Calculate initial score based on position and engine."""
132
+ # Base score from position (if available)
133
+ position = item.get("position", 10)
134
+ position_score = max(0.3, 1.0 - (position * 0.05))
135
+
136
+ # Bonus for certain engines
137
+ engine = item.get("engine", "").lower()
138
+ engine_bonus = {
139
+ "google": 0.1,
140
+ "bing": 0.05,
141
+ "duckduckgo": 0.05,
142
+ "wikipedia": 0.1,
143
+ "arxiv": 0.15,
144
+ "google scholar": 0.15,
145
+ }.get(engine, 0)
146
+
147
+ return min(1.0, position_score + engine_bonus)
148
+
149
+
150
async def get_searxng_engines(searxng_url: Optional[str] = None) -> list[str]:
    """Return names of enabled engines exposed by a SearXNG instance.

    Queries the instance's /config endpoint; returns an empty list on any
    network, HTTP, or parsing failure.
    """
    base = searxng_url or DEFAULT_SEARXNG_URL

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(f"{base}/config")
            resp.raise_for_status()
            config = resp.json()

        enabled = []
        for engine in config.get("engines", []):
            if not engine.get("disabled", False):
                enabled.append(engine["name"])
        return enabled
    except Exception:
        return []
app/sources/tavily.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tavily search source integration.
2
+
3
+ Tavily provides high-quality, AI-optimized search results.
4
+ """
5
+
6
+ from datetime import datetime
7
+ from typing import Literal, Optional
8
+
9
+ import httpx
10
+
11
+ from app.config import get_settings
12
+
13
+
14
async def search_tavily(
    query: str,
    max_results: int = 10,
    freshness: Literal["day", "week", "month", "year", "any"] = "any",
    include_domains: Optional[list[str]] = None,
    exclude_domains: Optional[list[str]] = None,
    search_depth: Literal["basic", "advanced"] = "advanced",
) -> list[dict]:
    """
    Search using the Tavily API.

    Args:
        query: Search query
        max_results: Maximum results to return
        freshness: Filter by recency
        include_domains: Only include these domains
        exclude_domains: Exclude these domains
        search_depth: "basic" (fast) or "advanced" (thorough)

    Returns:
        List of result dicts with title, url, content, published_date
        (datetime or None), score, source. Empty list when no API key is
        configured or on any request failure.
    """
    settings = get_settings()
    api_key = settings.tavily_api_key
    if not api_key:
        # No credentials configured -- Tavily is simply unavailable.
        return []

    # Translate the freshness window into Tavily's "days" parameter.
    days_by_freshness = {
        "day": 1,
        "week": 7,
        "month": 30,
        "year": 365,
        "any": None,
    }

    payload: dict = {
        "api_key": api_key,
        "query": query,
        "search_depth": search_depth,
        "max_results": max_results,
        "include_answer": False,
        "include_raw_content": False,
    }

    # Optional filters are only sent when meaningful.
    if days_by_freshness.get(freshness):
        payload["days"] = days_by_freshness[freshness]
    if include_domains:
        payload["include_domains"] = include_domains
    if exclude_domains:
        payload["exclude_domains"] = exclude_domains

    def _parse_date(raw) -> Optional[datetime]:
        # Tavily dates are ISO 8601, sometimes with a trailing "Z".
        if not raw:
            return None
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            return None

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                "https://api.tavily.com/search",
                json=payload,
            )
            response.raise_for_status()
            data = response.json()

            return [
                {
                    "title": entry.get("title", ""),
                    "url": entry.get("url", ""),
                    "content": entry.get("content", ""),
                    "published_date": _parse_date(entry.get("published_date")),
                    "score": entry.get("score", 0.5),
                    "source": "tavily",
                }
                for entry in data.get("results", [])
            ]

    except httpx.HTTPError as e:
        print(f"Tavily search error: {e}")
        return []
    except Exception as e:
        print(f"Tavily unexpected error: {e}")
        return []
app/sources/wikipedia.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Wikipedia Search source.
2
+
3
+ Uses Wikipedia's free API for background/context information.
4
+ No API key required, unlimited usage.
5
+ """
6
+
7
+ from datetime import datetime
8
+ from typing import Optional
9
+
10
+ import httpx
11
+
12
+
13
async def search_wikipedia(
    query: str,
    max_results: int = 5,
    language: str = "pt",
) -> list[dict]:
    """
    Search Wikipedia for relevant articles.

    Args:
        query: Search query
        max_results: Maximum results (1-10)
        language: Wikipedia language code (pt, en, es, etc)

    Returns:
        List of search results with title, url, content, score.
        Returns an empty list on any network or parsing failure.
    """
    from urllib.parse import quote  # local import: only needed to build article URLs

    base_url = f"https://{language}.wikipedia.org/w/api.php"

    # First, search for pages matching the query.
    search_params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": min(max_results, 10),
        "format": "json",
        "utf8": 1,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(base_url, params=search_params)
            response.raise_for_status()
            search_data = response.json()

            results = []
            search_results = search_data.get("query", {}).get("search", [])

            for i, item in enumerate(search_results):
                title = item.get("title", "")
                page_id = item.get("pageid")

                # Snippets come back with HTML highlight markup.
                snippet = _clean_html(item.get("snippet", ""))

                # Fetch the intro extract for richer content than the snippet.
                extract = await _get_page_extract(client, base_url, page_id)

                # Percent-encode the title so articles with special characters
                # (e.g. "C++", "#", "%", "?") yield valid URLs. Spaces become
                # underscores first, per Wikipedia's canonical URL form.
                slug = quote(title.replace(" ", "_"))

                results.append({
                    "title": f"Wikipedia: {title}",
                    "url": f"https://{language}.wikipedia.org/wiki/{slug}",
                    "content": extract or snippet,
                    "published_date": None,  # Wikipedia doesn't expose this easily
                    "score": 0.7 - (i * 0.05),  # reference material: lower base score
                    "source": "wikipedia",
                })

            return results

    except Exception as e:
        print(f"Wikipedia search error: {e}")
        return []
76
+
77
+
78
async def _get_page_extract(
    client: httpx.AsyncClient,
    base_url: str,
    page_id: int,
) -> Optional[str]:
    """Fetch a short plain-text intro extract for a Wikipedia page.

    Args:
        client: Shared HTTP client (reused across requests).
        base_url: Language-specific MediaWiki API endpoint.
        page_id: Numeric page id returned by the search API.

    Returns:
        The intro extract ("" when the page has none), or None on any failure.
    """
    params = {
        "action": "query",
        "pageids": page_id,
        "prop": "extracts",
        "exintro": True,       # intro section only
        "explaintext": True,   # plain text, no HTML
        "exsentences": 5,      # keep it short
        "format": "json",
    }

    try:
        response = await client.get(base_url, params=params)
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        page = pages.get(str(page_id), {})
        return page.get("extract", "")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed; the extract is best-effort enrichment only.
        return None
102
+
103
+
104
+ def _clean_html(text: str) -> str:
105
+ """Remove HTML tags from text."""
106
+ import re
107
+ clean = re.sub(r'<[^>]+>', '', text)
108
+ return clean.strip()
app/temporal/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Temporal intelligence module."""
app/temporal/freshness_scorer.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Freshness scoring for search results.
2
+
3
+ Calculates how fresh/recent content is using exponential decay.
4
+ """
5
+
6
+ import math
7
+ from datetime import datetime
8
+ from typing import Optional
9
+
10
+ from app.config import get_settings
11
+
12
+
13
+ def calculate_freshness_score(
14
+ published_date: Optional[datetime | str] = None,
15
+ half_life_days: Optional[int] = None,
16
+ ) -> float:
17
+ """
18
+ Calculate freshness score using exponential decay.
19
+
20
+ The score decays exponentially based on content age:
21
+ - Just published: ~1.0
22
+ - half_life_days old: ~0.5
23
+ - 2x half_life_days old: ~0.25
24
+ - Very old: approaches 0
25
+
26
+ Args:
27
+ published_date: When the content was published
28
+ half_life_days: Days until score halves (default from settings)
29
+
30
+ Returns:
31
+ Freshness score between 0.0 and 1.0
32
+ """
33
+ if published_date is None:
34
+ # Unknown date gets neutral score
35
+ return 0.5
36
+
37
+ settings = get_settings()
38
+ if half_life_days is None:
39
+ half_life_days = settings.default_freshness_half_life
40
+
41
+ # Parse string dates if needed
42
+ if isinstance(published_date, str):
43
+ try:
44
+ # Try common formats
45
+ for fmt in ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"]:
46
+ try:
47
+ published_date = datetime.strptime(published_date, fmt)
48
+ break
49
+ except ValueError:
50
+ continue
51
+ else:
52
+ return 0.5 # Couldn't parse, neutral score
53
+ except Exception:
54
+ return 0.5
55
+
56
+ # Calculate age in days
57
+ now = datetime.now()
58
+ if published_date > now:
59
+ # Future date (probably an error), treat as very fresh
60
+ return 1.0
61
+
62
+ age_days = (now - published_date).days
63
+
64
+ # Exponential decay: score = e^(-λt) where λ = ln(2) / half_life
65
+ decay_constant = 0.693147 / half_life_days # ln(2)
66
+ score = math.exp(-decay_constant * age_days)
67
+
68
+ # Ensure score is in valid range
69
+ return max(0.01, min(1.0, score))
70
+
71
+
72
def get_freshness_label(score: float) -> str:
    """
    Map a freshness score to a human-readable label.

    Args:
        score: Freshness score 0-1

    Returns:
        Label like "Very Fresh", "Recent", "Dated", etc.
    """
    # Thresholds checked from freshest to oldest; first match wins.
    thresholds = (
        (0.9, "Very Fresh"),
        (0.7, "Fresh"),
        (0.5, "Recent"),
        (0.3, "Dated"),
        (0.1, "Old"),
    )
    for cutoff, label in thresholds:
        if score >= cutoff:
            return label
    return "Very Old"
94
+
95
+
96
def adjust_score_by_freshness(
    base_score: float,
    freshness_score: float,
    temporal_urgency: float,
) -> float:
    """
    Blend a result's relevance score with its freshness.

    The freshness component is weighted by temporal urgency (capped at a
    40% share), so queries that demand current information lean more on
    freshness, while timeless queries mostly keep the base score.

    Args:
        base_score: Original relevance score (0-1)
        freshness_score: How fresh the content is (0-1)
        temporal_urgency: How important freshness is for this query (0-1)

    Returns:
        Adjusted score clamped to 0-1
    """
    # Freshness can contribute at most 40% of the final score.
    w_fresh = temporal_urgency * 0.4
    blended = (base_score * (1.0 - w_fresh)) + (freshness_score * w_fresh)
    return max(0.0, min(1.0, blended))
app/temporal/intent_detector.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Temporal intent detection for search queries.
2
+
3
+ Analyzes queries to determine if they require fresh/current information
4
+ or if historical information is acceptable.
5
+ """
6
+
7
+ import re
8
+ from datetime import datetime
9
+ from typing import Literal
10
+
11
+
12
+ def _get_dynamic_years() -> set[str]:
13
+ """Get current and previous year dynamically."""
14
+ current_year = datetime.now().year
15
+ return {str(current_year), str(current_year - 1)}
16
+
17
+
18
# Keywords that strongly indicate need for current information.
# Matched by plain substring search against the lowercased query, so short
# entries ("new", "now") can also hit inside longer words.
FRESHNESS_KEYWORDS: set[str] = {
    # English
    "latest", "newest", "recent", "current", "today", "now",
    "this week", "this month", "this year", "breaking",
    "update", "updates", "new", "just", "announced",
    *_get_dynamic_years(),  # current and previous year, e.g. "2025"/"2024"
    # Portuguese
    "último", "últimos", "recente", "atual", "hoje", "agora",
    "essa semana", "esse mês", "esse ano", "novidade",
    "atualização", "novo", "novos", "anunciado",
}

# Keywords that indicate historical queries (less urgent freshness).
HISTORICAL_KEYWORDS: set[str] = {
    # English
    "history", "historical", "origin", "origins", "invented",
    "founded", "first", "original", "classic", "traditional",
    # Portuguese
    "história", "histórico", "origem", "inventado", "fundado",
}

# Entity types that typically require fresh information.
# Each entry is a regex applied (re.search) to the lowercased query.
FRESH_ENTITY_PATTERNS: list[str] = [
    r"\b(?:price|prices|stock|stocks|market)\b",  # Financial
    r"\b(?:weather|forecast|temperature)\b",  # Weather
    r"\b(?:news|headlines|breaking)\b",  # News
    r"\b(?:score|scores|game|match|vs)\b",  # Sports
    r"\b(?:version|release|update|patch)\b",  # Software
    r"\b(?:gpt-?\d|claude|gemini|llama|mistral)\b",  # AI models
]
47
+
48
+
49
def detect_temporal_intent(
    query: str,
) -> tuple[Literal["current", "historical", "neutral"], float]:
    """
    Detect the temporal intent of a search query.

    Args:
        query: The search query string

    Returns:
        Tuple of (intent, urgency) where:
        - intent: "current", "historical", or "neutral"
        - urgency: float 0-1 indicating how important freshness is
    """
    text = query.lower()

    def tally(keywords, per_hit: float) -> float:
        # Substring matching against the lowercased query; each hit adds
        # a fixed increment (accumulated one-by-one, same as before).
        total = 0.0
        for keyword in keywords:
            if keyword in text:
                total += per_hit
        return total

    fresh_evidence = tally(FRESHNESS_KEYWORDS, 0.3)
    hist_evidence = tally(HISTORICAL_KEYWORDS, 0.3)

    # Entities that usually demand up-to-date answers.
    for pattern in FRESH_ENTITY_PATTERNS:
        if re.search(pattern, text):
            fresh_evidence += 0.2

    # Question forms that often imply current info is needed.
    if re.search(r"\b(?:what is|who is|how to|where is)\b", text):
        fresh_evidence += 0.1

    # Superlatives usually want the current state of the world.
    if re.search(r"\b(?:best|top|most|fastest|cheapest)\b", text):
        fresh_evidence += 0.15

    # Cap both evidence totals at 1.0.
    fresh_evidence = min(fresh_evidence, 1.0)
    hist_evidence = min(hist_evidence, 1.0)

    # Decide intent; ties or weak evidence fall through to "neutral".
    if fresh_evidence > hist_evidence and fresh_evidence > 0.2:
        return "current", min(0.3 + fresh_evidence, 1.0)
    if hist_evidence > fresh_evidence and hist_evidence > 0.2:
        return "historical", max(0.2 - hist_evidence * 0.1, 0.1)
    return "neutral", 0.5