Upload 40 files
Browse files- app/__init__.py +3 -0
- app/agents/__init__.py +1 -0
- app/agents/browser_agent.py +291 -0
- app/agents/browser_agent_v2.py +234 -0
- app/agents/browser_agent_v3.py +230 -0
- app/agents/deep_research.py +236 -0
- app/agents/flaresolverr.py +128 -0
- app/agents/graph/__init__.py +1 -0
- app/agents/graph/nodes.py +338 -0
- app/agents/graph/runner.py +133 -0
- app/agents/graph/simple_agent.py +321 -0
- app/agents/graph/state.py +128 -0
- app/agents/heavy_search.py +192 -0
- app/agents/llm_client.py +192 -0
- app/agents/planner.py +133 -0
- app/agents/synthesizer.py +173 -0
- app/api/__init__.py +1 -0
- app/api/routes/__init__.py +1 -0
- app/api/routes/search.py +579 -0
- app/api/schemas.py +159 -0
- app/config.py +64 -0
- app/main.py +72 -0
- app/middleware/__init__.py +1 -0
- app/middleware/rate_limiter.py +45 -0
- app/reranking/__init__.py +1 -0
- app/reranking/authority_scorer.py +134 -0
- app/reranking/embeddings.py +102 -0
- app/reranking/pipeline.py +127 -0
- app/sources/__init__.py +1 -0
- app/sources/aggregator.py +145 -0
- app/sources/brave.py +124 -0
- app/sources/duckduckgo.py +103 -0
- app/sources/images.py +135 -0
- app/sources/scraper.py +110 -0
- app/sources/searxng.py +166 -0
- app/sources/tavily.py +106 -0
- app/sources/wikipedia.py +108 -0
- app/temporal/__init__.py +1 -0
- app/temporal/freshness_scorer.py +121 -0
- app/temporal/intent_detector.py +107 -0
app/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lancer - Advanced AI Search API"""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
app/agents/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Agents module."""
|
app/agents/browser_agent.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Browser Agent - Chrome with live stream and agent memory.
|
| 2 |
+
|
| 3 |
+
Uses E2B Desktop sandbox with Chrome browser.
|
| 4 |
+
Time limit: 5 minutes (300 seconds)
|
| 5 |
+
Shows live video stream.
|
| 6 |
+
Includes full memory/history tracking via AgentState.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import shlex
|
| 12 |
+
import logging
|
| 13 |
+
import base64
|
| 14 |
+
import time
|
| 15 |
+
from typing import AsyncGenerator, Optional
|
| 16 |
+
|
| 17 |
+
from app.config import get_settings
|
| 18 |
+
from app.agents.llm_client import generate_completion
|
| 19 |
+
from app.agents.graph.state import AgentState, NodeType
|
| 20 |
+
from app.agents.flaresolverr import is_cloudflare_blocked
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
MAX_TIME_SECONDS = 300 # 5 minutes
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def run_browser_agent(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Drive a Chrome session inside an E2B desktop sandbox to research *task*.

    Runs an observe→decide→act loop: screenshot + curl-based text extraction,
    then one LLM call per step that picks SEARCH / NAVIGATE / SCROLL / DONE.
    History (visited URLs, actions, extracted snippets) is tracked on an
    AgentState so the LLM is never shown an empty context.

    Args:
        task: Natural-language task/query for the agent.
        url: Optional starting page; when absent, a DuckDuckGo HTML search
            for *task* is used as the entry point.

    Yields:
        Event dicts with a "type" key: "status" (progress text), "stream"
        (live-view URL), "stream_end", "result" (final answer + links),
        "complete", or "error".
    """
    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    # Initialize agent state with memory (visited URLs, actions, errors).
    # Wall-clock budget is enforced by state.should_continue() below.
    state = AgentState(
        task=task,
        url=url,
        timeout_seconds=MAX_TIME_SECONDS,
        start_time=time.time()
    )

    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None  # kept outside try so the finally block can clean up

    try:
        # Imported lazily so the module loads even without e2b-desktop installed.
        from e2b_desktop import Sandbox

        # The e2b SDK reads the key from the environment.
        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating virtual desktop..."}
        # Sandbox lifetime (600s) deliberately exceeds the agent budget (300s).
        desktop = Sandbox.create(timeout=600)
        state.desktop = desktop

        # Start streaming — best-effort; the agent still works without a live view.
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            logger.info(f"Stream started: {stream_url}")
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Could not start stream: {e}")

        # Launch Chrome
        yield {"type": "status", "message": "🌐 Launching browser..."}

        if url:
            start_url = url
        else:
            # NOTE(review): only spaces are escaped here; characters like
            # '&' or '#' in the task would corrupt the query URL — confirm
            # whether proper percent-encoding is wanted.
            search_query = task.replace(' ', '+')
            start_url = f"https://html.duckduckgo.com/html/?q={search_query}"

        chrome_flags = "--no-sandbox --disable-gpu --start-maximized --no-first-run --disable-default-apps --disable-popup-blocking --disable-translate --no-default-browser-check"
        desktop.commands.run(f"google-chrome {chrome_flags} {shlex.quote(start_url)} &", background=True)
        desktop.wait(3000)

        # Close dialogs (e.g. Chrome's first-run popups) with a blind Enter.
        desktop.press("enter")
        desktop.wait(1000)

        # Add to memory
        state.visited_urls.append(start_url)
        state.add_action({"type": "navigate", "url": start_url})

        # Main loop - time based with memory
        while state.should_continue():
            state.step_count += 1
            elapsed = int(state.get_elapsed_time())
            remaining = int(state.get_remaining_time())

            yield {"type": "status", "message": f"🔍 Step {state.step_count}: Analyzing... ({elapsed}s / {MAX_TIME_SECONDS}s)"}

            # Take screenshot.
            # NOTE(review): screenshot_b64 is computed but never consumed
            # below — possibly intended for a future event; confirm.
            screenshot_bytes = desktop.screenshot()
            screenshot_b64 = base64.b64encode(screenshot_bytes).decode('utf-8')

            # Get page content. The "current" page is approximated by the
            # last navigated URL, re-fetched with curl rather than read from
            # the live browser DOM.
            current_url = state.visited_urls[-1]
            page_content = ""

            try:
                # curl | sed pipeline: strip <script>/<style>, then all tags,
                # collapse whitespace, cap at 6000 chars for the prompt.
                result = desktop.commands.run(
                    f"curl -sL --max-time 10 --connect-timeout 5 "
                    f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0' "
                    f"{shlex.quote(current_url)} 2>/dev/null | "
                    "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
                    "sed 's/<[^>]*>//g' | "
                    "tr -s ' \\n' ' ' | "
                    "head -c 6000",
                    timeout=15
                )
                page_content = result.stdout.strip() if hasattr(result, 'stdout') else ""
                state.page_content = page_content
            except Exception as e:
                logger.warning(f"Content extraction failed: {e}")
                state.add_error(f"Content extraction failed: {e}")

            # Check for Cloudflare block
            is_blocked = is_cloudflare_blocked(page_content) if page_content else False

            if is_blocked:
                yield {"type": "status", "message": f"🚫 Cloudflare at {current_url[:40]}..., trying next link..."}
                state.add_error(f"Cloudflare blocked: {current_url}")
            else:
                # Add to memory (only unblocked pages count as evidence).
                state.extracted_data.append({
                    "url": current_url,
                    "content_length": len(page_content),
                    "preview": page_content[:200]
                })

            # Build prompt with memory context
            memory_context = state.get_context_for_llm()
            history_str = "\n".join([f"- {u}" for u in state.visited_urls[-5:]])
            content_preview = page_content[:2000] if page_content else "(empty page)"

            prompt = f"""You are a browser agent with memory. Analyze the page and decide the next action.

TASK: {task}
CURRENT URL: {current_url}
TIME REMAINING: {remaining}s
STEP: {state.step_count}

MEMORY:
{memory_context}

VISITED URLS:
{history_str}

PAGE CONTENT (blocked={is_blocked}):
{content_preview}

What should I do? Reply with JSON:
{{"action": "SEARCH|NAVIGATE|SCROLL|DONE", "value": "search query or URL", "reason": "brief reason"}}

- SEARCH: Search for something new (use if current results are insufficient)
- NAVIGATE: Go to a specific URL found on the page (MUST be different from visited URLs)
- SCROLL: Scroll down for more content
- DONE: Task is complete, provide final answer

RULES:
1. Do NOT navigate to already visited URLs
2. If blocked, navigate to a different link immediately
3. If you have enough info, respond with DONE
4. Include "answer" field when action is DONE"""

            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500
            )

            # Parse response: take the outermost {...} span as JSON.
            # NOTE(review): bare except — falls back to treating the whole
            # reply as the final answer; a malformed reply therefore ends the run.
            try:
                json_match = response[response.find('{'):response.rfind('}')+1]
                decision = json.loads(json_match)
            except:
                logger.warning(f"Could not parse LLM response: {response[:200]}")
                decision = {"action": "DONE", "answer": response}

            action = decision.get("action", "DONE")
            value = decision.get("value", "")
            reason = decision.get("reason", "")

            # Record action in memory
            state.add_action({"type": action.lower(), "value": value, "reason": reason})

            yield {"type": "status", "message": f"🤔 Action: {action} - {reason[:50]}"}

            if action == "DONE":
                state.success = True
                final_answer = decision.get("answer", "")

                if not final_answer:
                    # Generate from memory: summarize the last 5 extracted pages.
                    all_content = "\n\n".join([
                        f"Source: {d['url']}\n{d.get('preview', '')}"
                        for d in state.extracted_data[-5:]
                    ])
                    final_prompt = f"Based on this content, answer: {task}\n\nContent:\n{all_content}"
                    final_answer = await generate_completion(
                        messages=[{"role": "user", "content": final_prompt}],
                        max_tokens=1000
                    )

                state.final_result = final_answer

                yield {"type": "stream_end", "message": "Done"}
                yield {
                    "type": "result",
                    "content": final_answer,
                    "links": state.visited_urls,
                    "steps": state.step_count,
                    "success": True
                }

                yield {"type": "complete", "message": f"Completed in {int(state.get_elapsed_time())}s with {state.step_count} steps"}
                return

            elif action == "SEARCH":
                search_query = value.replace(' ', '+')
                new_url = f"https://html.duckduckgo.com/html/?q={search_query}"

                # Skip searches we have already issued.
                if new_url not in state.visited_urls:
                    desktop.commands.run(f"google-chrome {shlex.quote(new_url)} &", background=True)
                    desktop.wait(3000)
                    state.visited_urls.append(new_url)

            elif action == "NAVIGATE":
                # Only absolute http(s) URLs; the prompt forbids revisits
                # but the model may still propose one, so guard here too.
                if value and value.startswith("http"):
                    if value in state.visited_urls:
                        yield {"type": "status", "message": f"⏭️ Already visited, skipping..."}
                        state.add_error(f"Tried to revisit: {value}")
                    else:
                        desktop.commands.run(f"google-chrome {shlex.quote(value)} &", background=True)
                        desktop.wait(3000)
                        state.visited_urls.append(value)

            elif action == "SCROLL":
                desktop.press("pagedown")
                desktop.wait(1500)

            # Small delay between iterations to let the page settle.
            desktop.wait(1000)

        # Timeout - the loop exhausted its time budget without a DONE;
        # synthesize a best-effort answer from whatever was collected.
        yield {"type": "status", "message": "⏰ Time limit reached, generating final answer from memory..."}

        all_content = "\n\n".join([
            f"Source: {d['url']}\n{d.get('preview', '')}"
            for d in state.extracted_data[-5:]
        ])
        final_prompt = f"Based on this content, answer: {task}\n\nContent:\n{all_content}"
        final_answer = await generate_completion(
            messages=[{"role": "user", "content": final_prompt}],
            max_tokens=1000
        )

        state.final_result = final_answer

        yield {"type": "stream_end", "message": "Done"}
        yield {
            "type": "result",
            "content": final_answer,
            "links": state.visited_urls,
            "steps": state.step_count,
            "success": True
        }
        yield {"type": "complete", "message": f"Completed in {MAX_TIME_SECONDS}s (timeout) with {state.step_count} steps"}

    except ImportError as e:
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Browser agent error")
        yield {"type": "error", "message": f"Error: {str(e)}"}
    finally:
        # Best-effort teardown; the sandbox would also expire on its own timeout.
        if desktop:
            try:
                desktop.stream.stop()
            except:
                pass
            try:
                desktop.kill()
            except:
                pass
|
app/agents/browser_agent_v2.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Browser Agent v2 - Uses Camoufox stealth browser inside E2B.
|
| 2 |
+
|
| 3 |
+
Camoufox = Firefox stealth que passa anti-bot.
|
| 4 |
+
Roda DENTRO do E2B sandbox.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import shlex
|
| 11 |
+
import time
|
| 12 |
+
from typing import AsyncGenerator, Optional
|
| 13 |
+
|
| 14 |
+
from app.config import get_settings
|
| 15 |
+
from app.agents.llm_client import generate_completion
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
async def run_browser_agent_v2(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Run the browser agent with the Camoufox stealth browser inside E2B.

    Camoufox is a stealth Firefox build intended to pass anti-bot checks.
    It is installed and run INSIDE the E2B sandbox: this function uploads a
    generated scraper script (see ``_build_script``), executes it once, and
    synthesizes a single LLM answer from its JSON output.

    Args:
        task: Natural-language question/task to research.
        url: Optional page to scrape directly instead of a DuckDuckGo search.

    Yields:
        Event dicts with a "type" key: "status", "stream", "stream_end",
        "result", "complete", or "error".
    """
    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None           # referenced in finally for cleanup
    start_time = time.time() # for the "Done in Ns" completion message

    try:
        # Lazy import so the module loads without e2b-desktop installed.
        from e2b_desktop import Sandbox

        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating sandbox..."}
        # Longer sandbox timeout (900s) because the Camoufox install +
        # scrape run (up to 240s) happens inside it.
        desktop = Sandbox.create(timeout=900)

        # Stream — best-effort live view; failures are non-fatal.
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Stream failed: {e}")

        # Install Camoufox in E2B (the sandbox image does not ship it).
        yield {"type": "status", "message": "📦 Installing stealth browser (pip)..."}

        try:
            # Install packages
            desktop.commands.run("pip install --user camoufox playwright -q", timeout=120)

            yield {"type": "status", "message": "🔽 Downloading Firefox stealth (~30s)..."}
            desktop.commands.run("camoufox fetch", timeout=180)

            yield {"type": "status", "message": "🔧 Installing browser dependencies..."}
            # '|| true' keeps this step best-effort: missing apt packages
            # should not abort the run.
            desktop.commands.run("sudo apt-get update -qq && sudo apt-get install -y -qq libgtk-3-0 libasound2 libdbus-glib-1-2 2>/dev/null || true", timeout=60)

            yield {"type": "status", "message": "✅ Browser ready!"}
        except Exception as e:
            logger.error(f"Camoufox install failed: {e}")
            yield {"type": "error", "message": f"Install failed: {e}"}
            return

        # Create and run scraper script
        yield {"type": "status", "message": f"🔍 Searching: {task[:40]}..."}

        script = _build_script(task, url)

        # Write script via a quoted heredoc ('EOF' prevents shell expansion
        # of the script body).
        desktop.commands.run(
            f"cat > /tmp/scrape.py << 'EOF'\n{script}\nEOF",
            timeout=10
        )

        yield {"type": "status", "message": "🌐 Navigating with stealth browser..."}

        # Run the scraper; it prints a single JSON object on stdout.
        result = desktop.commands.run("python3 /tmp/scrape.py", timeout=240)
        output = result.stdout.strip() if hasattr(result, 'stdout') else ""

        if not output:
            yield {"type": "error", "message": "No output from scraper"}
            return

        # Parse the scraper's JSON contract: {"content", "urls"} or {"error"}.
        # On malformed output, fall back to treating stdout as raw content.
        try:
            data = json.loads(output)
            content = data.get("content", "")
            urls = data.get("urls", [])
            error = data.get("error")

            if error:
                yield {"type": "error", "message": error}
                return
        except json.JSONDecodeError:
            content = output[:4000]
            urls = []

        # Synthesize with LLM — single call over all scraped content.
        yield {"type": "status", "message": "✨ Generating response..."}

        prompt = f"""Analise e responda:

PERGUNTA: {task}

CONTEÚDO:
{content[:5000]}

Use **negrito** para valores importantes. Seja direto."""

        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1200
        )

        # Fall back to raw scraped content if the LLM returned nothing.
        final = response.strip() if response else content[:1000]

        # Done
        yield {"type": "stream_end", "message": "Done"}

        yield {
            "type": "result",
            "content": final,
            "links": urls[:10],
            "success": True
        }

        elapsed = int(time.time() - start_time)
        yield {"type": "complete", "message": f"Done in {elapsed}s"}

    except ImportError:
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Agent error")
        yield {"type": "error", "message": str(e)}
    finally:
        # Best-effort teardown of stream and sandbox.
        if desktop:
            try:
                desktop.stream.stop()
            except:
                pass
            try:
                desktop.kill()
            except:
                pass
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _build_script(task: str, url: Optional[str] = None) -> str:
|
| 159 |
+
"""Build Python script to run inside E2B with Camoufox."""
|
| 160 |
+
|
| 161 |
+
task_safe = task.replace("'", "\\'").replace('"', '\\"')
|
| 162 |
+
search_url = url or f"https://html.duckduckgo.com/html/?q={task.replace(' ', '+')}"
|
| 163 |
+
|
| 164 |
+
return f'''
|
| 165 |
+
import json
|
| 166 |
+
import sys
|
| 167 |
+
|
| 168 |
+
try:
|
| 169 |
+
from camoufox.sync_api import Camoufox
|
| 170 |
+
except:
|
| 171 |
+
print(json.dumps({{"error": "Camoufox not found"}}))
|
| 172 |
+
sys.exit(1)
|
| 173 |
+
|
| 174 |
+
urls = []
|
| 175 |
+
contents = []
|
| 176 |
+
|
| 177 |
+
def extract(page):
|
| 178 |
+
try:
|
| 179 |
+
return page.evaluate("""() => {{
|
| 180 |
+
document.querySelectorAll('script,style,noscript').forEach(e => e.remove());
|
| 181 |
+
return document.body.innerText || '';
|
| 182 |
+
}}""")[:4000]
|
| 183 |
+
except:
|
| 184 |
+
return ""
|
| 185 |
+
|
| 186 |
+
def is_blocked(text):
|
| 187 |
+
t = text.lower()
|
| 188 |
+
if len(text) < 500:
|
| 189 |
+
blocks = ["checking your browser", "cloudflare", "access denied", "blocked"]
|
| 190 |
+
return any(b in t for b in blocks)
|
| 191 |
+
return False
|
| 192 |
+
|
| 193 |
+
try:
|
| 194 |
+
with Camoufox(headless=True) as browser:
|
| 195 |
+
page = browser.new_page()
|
| 196 |
+
|
| 197 |
+
# Search
|
| 198 |
+
page.goto("{search_url}", timeout=30000)
|
| 199 |
+
page.wait_for_timeout(2000)
|
| 200 |
+
urls.append("{search_url}")
|
| 201 |
+
|
| 202 |
+
content = extract(page)
|
| 203 |
+
if not is_blocked(content):
|
| 204 |
+
contents.append(content)
|
| 205 |
+
|
| 206 |
+
# Get links
|
| 207 |
+
links = page.evaluate("""() => {{
|
| 208 |
+
return Array.from(document.querySelectorAll('a[href^="http"]'))
|
| 209 |
+
.map(a => a.href)
|
| 210 |
+
.filter(h => !h.includes('duckduckgo') && !h.includes('google'))
|
| 211 |
+
.slice(0, 5);
|
| 212 |
+
}}""")
|
| 213 |
+
|
| 214 |
+
# Visit up to 2 links
|
| 215 |
+
for link in links[:2]:
|
| 216 |
+
if link in urls:
|
| 217 |
+
continue
|
| 218 |
+
try:
|
| 219 |
+
page.goto(link, timeout=20000)
|
| 220 |
+
page.wait_for_timeout(1500)
|
| 221 |
+
urls.append(link)
|
| 222 |
+
|
| 223 |
+
c = extract(page)
|
| 224 |
+
if not is_blocked(c):
|
| 225 |
+
contents.append(c)
|
| 226 |
+
except:
|
| 227 |
+
pass
|
| 228 |
+
|
| 229 |
+
result = "\\n\\n---\\n\\n".join(contents)
|
| 230 |
+
print(json.dumps({{"content": result[:8000], "urls": urls}}))
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
print(json.dumps({{"error": str(e)}}))
|
| 234 |
+
'''
|
app/agents/browser_agent_v3.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Browser Agent v3 - Batch extraction, minimal LLM calls.
|
| 2 |
+
|
| 3 |
+
Flow:
|
| 4 |
+
1. Search DuckDuckGo → Get top links
|
| 5 |
+
2. Batch extract content from 3-5 pages (NO LLM calls)
|
| 6 |
+
3. Send ALL content to LLM in ONE call
|
| 7 |
+
4. LLM either responds OR requests specific follow-up
|
| 8 |
+
|
| 9 |
+
Target: 2-4 LLM calls max instead of 40+
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import re
|
| 14 |
+
import shlex
|
| 15 |
+
import logging
|
| 16 |
+
import time
|
| 17 |
+
from typing import AsyncGenerator, Optional, List, Dict
|
| 18 |
+
|
| 19 |
+
from app.config import get_settings
|
| 20 |
+
from app.agents.llm_client import generate_completion
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Config
|
| 25 |
+
MAX_PAGES_TO_EXTRACT = 4
|
| 26 |
+
TIMEOUT_SECONDS = 300
|
| 27 |
+
CONTENT_PER_PAGE = 2000
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
async def run_browser_agent_v3(
    task: str,
    url: Optional[str] = None,
) -> AsyncGenerator[dict, None]:
    """Run the browser agent with batch extraction and minimal LLM calls.

    Three phases: (1) one DuckDuckGo search, (2) curl-extract the top
    MAX_PAGES_TO_EXTRACT result pages with NO LLM involvement, (3) a single
    LLM call over all collected content that produces the final answer.

    Args:
        task: Natural-language question/task to research.
        url: Accepted for interface parity with v1/v2.
            NOTE(review): *url* is never read in this body — confirm whether
            it should override the search phase.

    Yields:
        Event dicts with a "type" key: "status", "stream", "stream_end",
        "result", "complete", or "error".
    """
    settings = get_settings()

    if not settings.e2b_api_key:
        yield {"type": "error", "message": "E2B_API_KEY not configured"}
        return

    start_time = time.time()  # drives the per-page time-budget check below
    yield {"type": "status", "message": "🚀 Initializing agent..."}

    desktop = None  # referenced in finally for cleanup

    try:
        # Lazy import so the module loads without e2b-desktop installed.
        from e2b_desktop import Sandbox

        os.environ["E2B_API_KEY"] = settings.e2b_api_key

        yield {"type": "status", "message": "🖥️ Creating virtual desktop..."}
        desktop = Sandbox.create(timeout=600)

        # Start streaming — best-effort; failures are non-fatal.
        stream_url = None
        try:
            desktop.stream.start(require_auth=True)
            auth_key = desktop.stream.get_auth_key()
            stream_url = desktop.stream.get_url(auth_key=auth_key)
            yield {"type": "stream", "url": stream_url}
            desktop.wait(2000)
        except Exception as e:
            logger.warning(f"Could not start stream: {e}")

        # Launch Chrome (shown only on the live stream; extraction itself
        # uses curl, see _extract_page_content).
        yield {"type": "status", "message": "🌐 Launching browser..."}
        chrome_flags = "--no-sandbox --disable-gpu --start-maximized --no-first-run --disable-default-apps --disable-popup-blocking --disable-translate --no-default-browser-check"
        desktop.commands.run(f"google-chrome {chrome_flags} 'about:blank' &", background=True)
        desktop.wait(3000)
        desktop.press("enter")  # dismiss any first-run dialog
        desktop.wait(1000)

        # Phase 1: Search
        yield {"type": "status", "message": f"🔍 Searching: {task[:50]}..."}
        # NOTE(review): only spaces are escaped; '&'/'#' in the task would
        # corrupt the query URL — confirm whether percent-encoding is wanted.
        search_query = task.replace(' ', '+')
        search_url = f"https://html.duckduckgo.com/html/?q={search_query}"

        desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
        desktop.wait(3000)

        # Extract search results page (text only, via curl)
        search_content = await _extract_page_content(desktop, search_url)

        # Parse links from search results
        links = _extract_links_from_search(search_content, task)
        logger.info(f"Found {len(links)} relevant links")

        if not links:
            # Fallback: just use search content
            links = [search_url]

        # Phase 2: Batch extract from top pages (no LLM calls here)
        extracted_pages: List[Dict] = []

        for i, link in enumerate(links[:MAX_PAGES_TO_EXTRACT]):
            remaining = int(TIMEOUT_SECONDS - (time.time() - start_time))
            if remaining < 30:
                # Leave at least ~30s for the synthesis phase.
                break

            yield {"type": "status", "message": f"📊 Extracting page {i+1}/{min(len(links), MAX_PAGES_TO_EXTRACT)}... ({remaining}s remaining)"}

            try:
                # Navigate on screen for the viewer; actual text comes from curl.
                desktop.commands.run(f"google-chrome {shlex.quote(link)} &", background=True)
                desktop.wait(2500)

                content = await _extract_page_content(desktop, link)
                # Skip near-empty pages (<100 chars ≈ errors/redirect stubs).
                if content and len(content) > 100:
                    extracted_pages.append({
                        "url": link,
                        "content": content[:CONTENT_PER_PAGE]
                    })
                    logger.info(f"Extracted {len(content)} chars from {link[:50]}")
            except Exception as e:
                logger.warning(f"Failed to extract {link}: {e}")

        # Phase 3: ONE LLM call with all content
        yield {"type": "status", "message": "🤔 Analyzing all sources..."}

        # Build context: numbered sources separated by '---'.
        pages_context = "\n\n---\n\n".join([
            f"SOURCE {i+1}: {p['url']}\n{p['content']}"
            for i, p in enumerate(extracted_pages)
        ])

        prompt = f"""Você é um assistente de pesquisa. Analise as fontes abaixo e responda à pergunta.

PERGUNTA: {task}

FONTES COLETADAS:
{pages_context if pages_context else "(Nenhum conteúdo extraído)"}

INSTRUÇÕES:
1. Responda baseado APENAS nas fontes acima
2. Use **negrito** para valores importantes (preços, números, nomes)
3. Cite as fontes quando possível (ex: "Segundo o site X...")
4. Se as fontes não respondem a pergunta, diga isso honestamente
5. Seja direto e organizado

Responda em português:"""

        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1500
        )

        final_result = response.strip() if response else "Não foi possível gerar resposta."

        # Yield final result
        yield {"type": "stream_end", "message": "Stream ended"}

        yield {
            "type": "result",
            "content": final_result,
            "links": [p["url"] for p in extracted_pages],
            "success": True
        }

        elapsed = int(time.time() - start_time)
        yield {"type": "complete", "message": f"Completed in {elapsed}s with {len(extracted_pages)} sources"}

        logger.info(f"Agent complete. Sources: {len(extracted_pages)}, Time: {elapsed}s, LLM calls: 1")

    except ImportError as e:
        yield {"type": "error", "message": "e2b-desktop not installed"}
    except Exception as e:
        logger.exception("Browser agent error")
        yield {"type": "error", "message": f"Error: {str(e)}"}
    finally:
        # Best-effort teardown of stream and sandbox.
        if desktop:
            try:
                desktop.stream.stop()
            except Exception:
                pass
            try:
                desktop.kill()
            except Exception:
                pass
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
async def _extract_page_content(desktop, url: str) -> str:
    """Fetch a URL inside the sandbox and return its plain-text content.

    Downloads the page with curl, strips <script>/<style> blocks and all
    remaining HTML tags with sed, collapses whitespace with tr, and truncates
    the result to 8000 characters. Returns "" on any failure.
    """
    # Assemble the whole shell pipeline up front; the URL is shell-quoted.
    pipeline = (
        f"curl -sL --max-time 8 --connect-timeout 5 "
        f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
        f"{shlex.quote(url)} 2>/dev/null | "
        "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
        "sed 's/<[^>]*>//g' | "
        "tr -s ' \\n' ' ' | "
        "head -c 8000"
    )
    try:
        output = desktop.commands.run(pipeline, timeout=12)
        return output.stdout.strip() if hasattr(output, 'stdout') else ""
    except Exception as exc:
        logger.warning(f"Extract failed for {url}: {exc}")
        return ""
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _extract_links_from_search(content: str, task: str) -> List[str]:
|
| 200 |
+
"""Extract relevant links from DuckDuckGo search results."""
|
| 201 |
+
# DuckDuckGo HTML links pattern
|
| 202 |
+
links = []
|
| 203 |
+
|
| 204 |
+
# Find URLs in the content
|
| 205 |
+
url_pattern = r'https?://[^\s<>"\']+[a-zA-Z0-9/]'
|
| 206 |
+
found_urls = re.findall(url_pattern, content)
|
| 207 |
+
|
| 208 |
+
# Filter out search engine URLs and duplicates
|
| 209 |
+
seen = set()
|
| 210 |
+
for url in found_urls:
|
| 211 |
+
# Clean URL
|
| 212 |
+
url = url.rstrip('.,;:)')
|
| 213 |
+
|
| 214 |
+
# Skip search engines, trackers, etc
|
| 215 |
+
skip_domains = ['duckduckgo.com', 'google.com', 'bing.com', 'facebook.com', 'twitter.com', 'instagram.com']
|
| 216 |
+
if any(d in url.lower() for d in skip_domains):
|
| 217 |
+
continue
|
| 218 |
+
|
| 219 |
+
# Skip if already seen
|
| 220 |
+
domain = url.split('/')[2] if len(url.split('/')) > 2 else url
|
| 221 |
+
if domain in seen:
|
| 222 |
+
continue
|
| 223 |
+
seen.add(domain)
|
| 224 |
+
|
| 225 |
+
links.append(url)
|
| 226 |
+
|
| 227 |
+
if len(links) >= 8:
|
| 228 |
+
break
|
| 229 |
+
|
| 230 |
+
return links
|
app/agents/deep_research.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deep Research Orchestrator.
|
| 2 |
+
|
| 3 |
+
Coordinates the full deep research pipeline:
|
| 4 |
+
1. Planning (query decomposition)
|
| 5 |
+
2. Parallel searching (multiple dimensions)
|
| 6 |
+
3. Report synthesis
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import json
|
| 11 |
+
import time
|
| 12 |
+
from typing import AsyncIterator, Optional
|
| 13 |
+
|
| 14 |
+
from app.agents.planner import create_research_plan, ResearchPlan, ResearchDimension
|
| 15 |
+
from app.agents.llm_client import generate_completion_stream
|
| 16 |
+
from app.reranking.pipeline import rerank_results
|
| 17 |
+
from app.config import get_settings
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DimensionResult:
    """Results from researching a single dimension."""

    def __init__(self, dimension: ResearchDimension):
        # The research dimension this result set belongs to.
        self.dimension = dimension
        # Ranked search results for this dimension (empty until populated).
        self.results: list[dict] = []
        # Error message if searching this dimension failed; None on success.
        self.error: Optional[str] = None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
async def run_deep_research(
    query: str,
    max_dimensions: int = 6,
    max_sources_per_dim: int = 5,
    max_total_searches: int = 20,
) -> AsyncIterator[str]:
    """
    Run a deep research pipeline with streaming progress.

    Yields SSE-formatted events as the research progresses:
    status -> plan_ready -> (dimension_start/dimension_complete)* ->
    synthesis_start -> report_chunk* -> done, or a terminal "error" event.

    Args:
        query: The research query
        max_dimensions: Maximum dimensions to research
        max_sources_per_dim: Max results per dimension
        max_total_searches: Total Tavily API calls allowed

    Yields:
        SSE event strings in format: data: {json}\n\n
    """
    start_time = time.perf_counter()
    # NOTE(review): `settings` is never read below — confirm whether
    # get_settings() is kept only for its initialization side effect.
    settings = get_settings()

    try:
        # === PHASE 1: PLANNING ===
        yield _sse_event("status", {"phase": "planning", "message": "Analyzing query..."})

        plan = await create_research_plan(query, max_dimensions)

        yield _sse_event("plan_ready", {
            "refined_query": plan.refined_query,
            "dimensions": [
                {"name": d.name, "description": d.description, "priority": d.priority}
                for d in plan.dimensions
            ],
            "estimated_sources": plan.estimated_sources,
        })

        # === PHASE 2: PARALLEL SEARCHING ===
        yield _sse_event("status", {"phase": "searching", "message": "Researching dimensions..."})

        # Distribute the total search budget evenly across dimensions.
        num_dimensions = len(plan.dimensions)
        searches_per_dim = max(1, max_total_searches // num_dimensions)

        dimension_results: list[DimensionResult] = []

        # Search dimensions one at a time. (Despite the module docstring's
        # "parallel" wording, each dimension is awaited sequentially, with a
        # small delay between them.)
        for i, dimension in enumerate(plan.dimensions):
            yield _sse_event("dimension_start", {
                "index": i + 1,
                "total": num_dimensions,
                "name": dimension.name,
                "query": dimension.search_query,
            })

            # Search this dimension (errors are captured on the result).
            result = await _search_dimension(
                dimension=dimension,
                max_results=max_sources_per_dim,
                max_searches=searches_per_dim,
            )
            dimension_results.append(result)

            yield _sse_event("dimension_complete", {
                "index": i + 1,
                "name": dimension.name,
                "results_count": len(result.results),
                "error": result.error,
            })

            # Small delay to avoid rate limits
            await asyncio.sleep(0.1)

        # === PHASE 3: SYNTHESIS ===
        yield _sse_event("status", {"phase": "synthesizing", "message": "Generating report..."})
        yield _sse_event("synthesis_start", {})

        # Stream the report generation chunk-by-chunk.
        async for chunk in _synthesize_report_stream(query, plan, dimension_results):
            yield _sse_event("report_chunk", {"content": chunk})

        # === COMPLETE ===
        total_time = time.perf_counter() - start_time
        total_sources = sum(len(r.results) for r in dimension_results)

        yield _sse_event("done", {
            "total_sources": total_sources,
            "total_dimensions": num_dimensions,
            "total_time_seconds": round(total_time, 2),
        })

    except Exception as e:
        # Any failure ends the stream with a single error event.
        yield _sse_event("error", {"message": str(e)})
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
async def _search_dimension(
    dimension: ResearchDimension,
    max_results: int = 5,
    max_searches: int = 2,
) -> DimensionResult:
    """Run the aggregated search pipeline for one research dimension.

    Fetches a few extra results beyond `max_results`, reranks them, and
    stores the top hits on the returned DimensionResult. Any failure is
    captured in the result's `error` field instead of raising.
    """
    from app.sources.aggregator import aggregate_search

    outcome = DimensionResult(dimension)
    search_query = dimension.search_query

    try:
        # Pull a few extra hits so the reranker has candidates to drop.
        raw_hits = await aggregate_search(
            query=search_query,
            max_results=max_results + 3,
            include_wikipedia=True,
        )
    except Exception as exc:
        outcome.error = str(exc)
        return outcome

    if not raw_hits:
        return outcome

    try:
        # Embedding-based reranking only pays off with a large candidate
        # pool (e.g. when SearXNG returned many results).
        outcome.results = await rerank_results(
            query=search_query,
            results=raw_hits,
            temporal_urgency=0.5,
            max_results=max_results,
            use_embeddings=len(raw_hits) > 15,
        )
    except Exception as exc:
        outcome.error = str(exc)

    return outcome
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
async def _synthesize_report_stream(
    original_query: str,
    plan: ResearchPlan,
    dimension_results: list[DimensionResult],
) -> AsyncIterator[str]:
    """Stream the synthesis of the final report.

    Builds a globally-numbered source context from every dimension's
    results, prompts the LLM to write a Markdown report with [n] citations,
    streams the generated chunks, then appends a Sources section.
    """

    # Build context from all dimension results. Sources are numbered
    # globally across dimensions so citations stay unambiguous.
    context_parts = []
    all_sources = []
    source_index = 1

    for dr in dimension_results:
        if dr.results:
            context_parts.append(f"\n## {dr.dimension.name}\n")
            for r in dr.results:
                # Truncate each snippet to keep the prompt within budget.
                context_parts.append(
                    f"[{source_index}] {r.get('title', 'Untitled')}\n"
                    f"  URL: {r.get('url', '')}\n"
                    f"  Content: {r.get('content', '')[:400]}...\n"
                )
                all_sources.append({
                    "index": source_index,
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                })
                source_index += 1

    context = "\n".join(context_parts)

    # Build synthesis prompt
    prompt = f"""You are a research analyst. Create a comprehensive research report based on the gathered information.

ORIGINAL QUERY: {original_query}
REFINED QUERY: {plan.refined_query}

RESEARCH DIMENSIONS:
{', '.join(d.name for d in plan.dimensions)}

GATHERED INFORMATION:
{context}

INSTRUCTIONS:
1. Write a comprehensive research report in Markdown format
2. Start with an Executive Summary (2-3 paragraphs)
3. Create a section for each research dimension
4. Use citations [1], [2], etc. to reference sources
5. Include a Conclusion section
6. Be thorough but concise
7. Write in the same language as the query
8. Use headers (##) to organize sections

Generate the report:"""

    messages = [
        {"role": "system", "content": "You are a research analyst creating detailed reports."},
        {"role": "user", "content": prompt},
    ]

    try:
        # Stream model output chunk-by-chunk to the caller.
        async for chunk in generate_completion_stream(messages, temperature=0.4):
            yield chunk

        # Append sources at the end as Markdown links.
        yield "\n\n---\n\n## Sources\n\n"
        for src in all_sources:
            yield f"[{src['index']}] [{src['title']}]({src['url']})\n"

    except Exception as e:
        # Surface the failure inline rather than aborting the stream.
        yield f"\n\n**Error generating report:** {e}"
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _sse_event(event_type: str, data: dict) -> str:
|
| 234 |
+
"""Format an SSE event."""
|
| 235 |
+
payload = {"type": event_type, **data}
|
| 236 |
+
return f"data: {json.dumps(payload)}\n\n"
|
app/agents/flaresolverr.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FlareSolverr client for Cloudflare bypass.
|
| 2 |
+
|
| 3 |
+
FlareSolverr uses undetected-chromedriver to solve Cloudflare challenges.
|
| 4 |
+
Must be running at http://localhost:8191 in the E2B sandbox.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import json
|
| 9 |
+
import shlex
|
| 10 |
+
from typing import Optional, Tuple
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
FLARESOLVERR_URL = "http://localhost:8191/v1"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
async def solve_cloudflare(desktop, url: str, timeout: int = 60) -> Tuple[bool, str]:
    """
    Use FlareSolverr to bypass Cloudflare protection.

    Args:
        desktop: E2B desktop instance
        url: URL to fetch through FlareSolverr
        timeout: Max seconds to wait for solution

    Returns:
        (success: bool, content: str) -- content is the tag-stripped page
        text (up to ~6000 chars), or "" on failure.
    """
    # NOTE(review): declared async but contains no await; desktop.commands.run
    # appears synchronous, so this blocks the event loop while curl runs.
    try:
        # Make request to FlareSolverr - properly escape the JSON payload
        payload = json.dumps({
            "cmd": "request.get",
            "url": url,
            "maxTimeout": timeout * 1000  # seconds -> milliseconds
        })

        result = desktop.commands.run(
            f"curl -s -X POST {shlex.quote(FLARESOLVERR_URL)} "
            f"-H 'Content-Type: application/json' "
            f"-d {shlex.quote(payload)} 2>/dev/null",
            # Allow curl extra time beyond FlareSolverr's own maxTimeout.
            timeout=timeout + 10
        )

        if not hasattr(result, 'stdout') or not result.stdout:
            return False, ""

        response = json.loads(result.stdout)

        if response.get("status") == "ok":
            solution = response.get("solution", {})
            html = solution.get("response", "")

            # Strip HTML tags - use base64 to safely pass content through the
            # shell without quoting issues (only the first 10000 chars).
            if html:
                import base64
                html_b64 = base64.b64encode(html[:10000].encode()).decode()
                clean_result = desktop.commands.run(
                    f"echo {shlex.quote(html_b64)} | base64 -d | sed 's/<[^>]*>//g' | tr -s ' \\n' ' ' | head -c 6000",
                    timeout=5
                )
                # Fall back to the raw (truncated) HTML if stripping failed.
                content = clean_result.stdout.strip() if hasattr(clean_result, 'stdout') else html[:6000]
                logger.info(f"FlareSolverr solved: {url[:50]}")
                return True, content

        # Either status != ok, or status ok with an empty solution body.
        logger.warning(f"FlareSolverr failed: {response.get('message', 'unknown')}")
        return False, ""

    except Exception as e:
        logger.warning(f"FlareSolverr error: {e}")
        return False, ""
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def is_cloudflare_blocked(content: str) -> bool:
    """Check whether page content indicates a Cloudflare challenge page.

    Only returns True for actual challenge pages, not pages that merely
    mention Cloudflare.
    """
    lowered = content.lower()

    # Phrases typical of a Cloudflare challenge. A hit here counts as a
    # block only when the page is also suspiciously short, because real
    # articles can quote these phrases too.
    strong_indicators = (
        "checking your browser before accessing",
        "please wait while we verify",
        "ray id:",
        "cloudflare ray id",
        "enable javascript and cookies",
        "attention required! | cloudflare",
        "just a moment...",
        "ddos protection by cloudflare",
    )
    if len(content) < 500 and any(phrase in lowered for phrase in strong_indicators):
        return True

    # These phrases only ever appear on challenge pages, so length is moot.
    definite_blocks = (
        "checking if the site connection is secure",
        "please turn javascript on and reload the page",
        "please enable cookies",
    )
    return any(phrase in lowered for phrase in definite_blocks)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def is_login_wall(content: str) -> bool:
    """Heuristically detect whether a page is gated behind a login.

    Requires at least two distinct login-related phrases, so a single
    innocuous "log in" link doesn't trigger a false positive.
    """
    lowered = content.lower()
    phrases = (
        "sign in",
        "log in",
        "login",
        "create account",
        "register",
        "enter your password",
        "authentication required",
    )
    matched = [p for p in phrases if p in lowered]
    return len(matched) >= 2
|
app/agents/graph/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Agent Graph Package
|
app/agents/graph/nodes.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Graph nodes for the agent execution.
|
| 2 |
+
|
| 3 |
+
Each node represents a step in the agent's decision process:
|
| 4 |
+
- PlanNode: Decomposes the task into subtasks
|
| 5 |
+
- SearchNode: Performs web searches
|
| 6 |
+
- NavigateNode: Navigates to URLs
|
| 7 |
+
- ExtractNode: Extracts content from pages
|
| 8 |
+
- VerifyNode: Verifies if goal is achieved
|
| 9 |
+
- RespondNode: Generates final response
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
import shlex
|
| 15 |
+
import base64
|
| 16 |
+
from abc import ABC, abstractmethod
|
| 17 |
+
from typing import Tuple
|
| 18 |
+
|
| 19 |
+
from app.agents.graph.state import AgentState, NodeType
|
| 20 |
+
from app.agents.llm_client import generate_completion
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class BaseNode(ABC):
    """Base class for all graph nodes.

    Each concrete node implements one step of the agent loop (plan, search,
    navigate, extract, verify, respond) and tells the runner which node to
    execute next.
    """

    # Identifies this node in the graph; overridden by every subclass.
    node_type: NodeType = NodeType.START

    @abstractmethod
    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        """Execute the node logic and return updated state + next node."""
        pass
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class PlanNode(BaseNode):
    """Decomposes task into subtasks.

    Asks the LLM for a JSON plan (goal, ordered steps, success criteria),
    stores it on the state, and routes to NAVIGATE when the plan starts with
    a navigation step and a starting URL exists; otherwise to SEARCH.
    """

    node_type = NodeType.PLAN

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        # Portuguese prompt: decompose the task into simple steps and
        # answer with JSON only.
        prompt = f"""Você é um planejador de tarefas. Decomponha a tarefa em passos simples.

TAREFA: {state.task}
URL inicial: {state.url or 'Nenhuma - começar com busca'}

Responda com JSON:
{{
    "goal": "objetivo principal",
    "steps": [
        {{"action": "search", "query": "termos de busca"}},
        {{"action": "navigate", "description": "onde navegar"}},
        {{"action": "extract", "what": "o que extrair"}}
    ],
    "success_criteria": "critério de sucesso"
}}

Responda APENAS o JSON, sem explicação."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500
            )

            # Parse JSON: strip a Markdown code fence (and an optional
            # "json" language tag) that the model may wrap around it.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
            if response.startswith("json"):
                response = response[4:]

            plan = json.loads(response)
            state.plan = plan
            logger.info(f"Plan created: {plan.get('goal', 'No goal')}")

            # Decide next node based on plan: navigate first only when the
            # plan asks for it AND we already have a starting URL.
            if plan.get("steps") and plan["steps"][0].get("action") == "navigate" and state.url:
                return state, NodeType.NAVIGATE
            return state, NodeType.SEARCH

        except Exception as e:
            logger.error(f"Planning failed: {e}")
            state.add_error(f"Planning failed: {e}")
            # Fallback to search with a single-step plan built from the task.
            state.plan = {"goal": state.task, "steps": [{"action": "search", "query": state.task}]}
            return state, NodeType.SEARCH
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class SearchNode(BaseNode):
    """Performs a web search via DuckDuckGo's HTML endpoint.

    Picks the query from the plan's first "search" step (falling back to the
    raw task), opens the results page in Chrome inside the sandbox, and
    hands off to the extraction node. On failure routes to VERIFY so the
    agent can decide what to do with whatever state it has.
    """

    node_type = NodeType.SEARCH

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        from urllib.parse import quote_plus

        desktop = state.desktop

        # Determine search query: prefer the plan's first explicit search step.
        query = state.task
        for step in state.plan.get("steps") or []:
            if step.get("action") == "search" and step.get("query"):
                query = step["query"]
                break

        # Percent-encode the query so characters like '&', '?', '#' and '='
        # cannot break or truncate the search URL (the previous bare
        # space->'+' replacement left them unescaped). quote_plus still maps
        # spaces to '+', so plain queries produce the same URL as before.
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"

        try:
            desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
            state.visited_urls.append(search_url)
            # Give the browser a moment to load the results page.
            desktop.wait(3000)

            state.add_action({"type": "search", "query": query})
            logger.info(f"Searched: {query}")

            return state, NodeType.EXTRACT

        except Exception as e:
            state.add_error(f"Search failed: {e}")
            return state, NodeType.VERIFY
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class NavigateNode(BaseNode):
    """Opens a target URL in the sandboxed browser.

    The destination is taken from the state's explicit URL or, failing
    that, from the first link of the most recently extracted data. With no
    destination at all, control falls back to the search node.
    """

    node_type = NodeType.NAVIGATE

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        destination = state.url or self._link_from_extracted(state)
        if not destination:
            # Nothing to open -- go gather candidate links via search.
            return state, NodeType.SEARCH

        try:
            state.desktop.commands.run(
                f"google-chrome {shlex.quote(destination)} &", background=True
            )
            if destination not in state.visited_urls:
                state.visited_urls.append(destination)
            # Allow the page time to render before extraction.
            state.desktop.wait(3000)

            state.add_action({"type": "navigate", "url": destination})
            logger.info(f"Navigated to: {destination[:50]}")

            return state, NodeType.EXTRACT

        except Exception as e:
            state.add_error(f"Navigation failed: {e}")
            return state, NodeType.SEARCH

    @staticmethod
    def _link_from_extracted(state: AgentState) -> str:
        """Return the first link from the latest extracted data, or ''."""
        if not state.extracted_data:
            return ""
        latest = state.extracted_data[-1]
        candidates = latest.get("data", {}).get("links") or []
        return candidates[0] if candidates else ""
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class ExtractNode(BaseNode):
    """Extracts content from the current page.

    Captures the active window title via xdotool (best effort) and fetches
    the page body with curl, stripping <script>/<style> blocks and all
    remaining HTML tags down to a 6000-character plain-text snippet.
    Always routes to VERIFY, even on failure.
    """

    node_type = NodeType.EXTRACT

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        desktop = state.desktop
        current_url = state.visited_urls[-1] if state.visited_urls else ""

        try:
            # Get window title (empty string if xdotool yields nothing).
            result = desktop.commands.run("xdotool getactivewindow getwindowname 2>/dev/null", timeout=5)
            state.window_title = result.stdout.strip() if hasattr(result, 'stdout') else ""

            # Extract page content via curl. shlex.quote prevents a URL
            # containing quotes or shell metacharacters from breaking out of
            # the command (the original interpolated it inside bare single
            # quotes), and matches how _extract_page_content quotes URLs.
            if current_url.startswith("http"):
                result = desktop.commands.run(
                    f"curl -sL --max-time 10 --connect-timeout 5 "
                    f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
                    f"{shlex.quote(current_url)} 2>/dev/null | "
                    "sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
                    "sed 's/<[^>]*>//g' | "
                    "tr -s ' \\n' ' ' | "
                    "head -c 6000",
                    timeout=15
                )
                state.page_content = result.stdout.strip() if hasattr(result, 'stdout') else ""

            state.add_action({"type": "extract", "content_length": len(state.page_content)})
            logger.info(f"Extracted {len(state.page_content)} chars from {current_url[:50]}")

            return state, NodeType.VERIFY

        except Exception as e:
            state.add_error(f"Extraction failed: {e}")
            return state, NodeType.VERIFY
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
class VerifyNode(BaseNode):
    """Verifies if goal is achieved and decides next action.

    Sends the task, plan, action history and current page content to the
    LLM and routes on its JSON decision: RESPOND when complete, otherwise
    SEARCH / NAVIGATE / re-EXTRACT (after scrolling).
    """

    node_type = NodeType.VERIFY

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        context = state.get_context_for_llm()
        # Cap the page preview so the prompt stays within token budget.
        page_preview = state.page_content[:4000] if state.page_content else "(No content)"

        # Portuguese prompt: asks for a JSON decision -- complete / search /
        # navigate / scroll -- given task, plan, history and remaining time.
        prompt = f"""Você é um agente de navegação web. Analise o conteúdo e decida o próximo passo.

TAREFA: {state.task}
PLANO: {state.plan.get('goal', 'Nenhum')}
CRITÉRIO DE SUCESSO: {state.plan.get('success_criteria', 'Encontrar a informação pedida')}

HISTÓRICO:
{context}

CONTEÚDO DA PÁGINA ATUAL:
{page_preview}

TEMPO RESTANTE: {int(state.get_remaining_time())}s

Decida:
1. Se encontrou a resposta, retorne: {{"status": "complete", "result": "Sua resposta formatada com **negrito** para valores importantes"}}
2. Se precisa buscar mais, retorne: {{"action": "search", "query": "nova busca"}}
3. Se precisa navegar para um link, retorne: {{"action": "navigate", "url": "https://..."}}
4. Se precisa rolar a página, retorne: {{"action": "scroll"}}

REGRAS:
- Use **negrito** para preços e valores importantes
- Cite as fontes
- Se página pede login, tente outra fonte
- Seja eficiente

Responda APENAS com JSON válido."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=800
            )

            # Parse response: strip a Markdown code fence (and optional
            # "json" tag) that the model may wrap around the JSON.
            response = response.strip()
            if response.startswith("```"):
                response = response.split("```")[1]
            if response.startswith("json"):
                response = response[4:]

            decision = json.loads(response)
            state.add_action({"type": "verify", "decision": decision})

            # Route based on decision
            if decision.get("status") == "complete":
                state.final_result = decision.get("result", "")
                state.success = True
                logger.info("Goal achieved!")
                return state, NodeType.RESPOND

            action = decision.get("action", "")
            if action == "search":
                # Replace the plan's steps with the newly requested search.
                state.plan["steps"] = [{"action": "search", "query": decision.get("query", state.task)}]
                return state, NodeType.SEARCH
            elif action == "navigate":
                state.url = decision.get("url", "")
                return state, NodeType.NAVIGATE
            elif action == "scroll":
                # Scroll down a few notches, then re-extract the page.
                state.desktop.scroll(-3)
                state.desktop.wait(1000)
                return state, NodeType.EXTRACT

            # Default: try another search
            return state, NodeType.SEARCH

        except Exception as e:
            logger.error(f"Verify failed: {e}")
            state.add_error(f"Verify failed: {e}")

            # Nearly out of time: answer with what we have instead of
            # burning the remainder on another search.
            if state.get_remaining_time() < 30:
                return state, NodeType.RESPOND
            return state, NodeType.SEARCH
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
class RespondNode(BaseNode):
    """Generates final response.

    Terminal node of the agent graph: synthesizes everything collected so
    far (LLM context, last page content, visited URLs) into one user-facing
    answer. It always routes back to RESPOND, which the runner treats as
    the stop signal.
    """

    node_type = NodeType.RESPOND

    async def execute(self, state: AgentState) -> Tuple[AgentState, NodeType]:
        """Produce the final answer and mark the run as finished.

        Args:
            state: Accumulated agent state (task, extracted data, page
                content, visited URLs).

        Returns:
            The updated state and ``NodeType.RESPOND`` (self-loop = done).
        """
        # If we already have a result, we're done
        if state.final_result:
            state.success = True
            return state, NodeType.RESPOND

        # Generate response from collected data
        context = state.get_context_for_llm()
        # Cap page content to keep the prompt size bounded; the fallback
        # string is Portuguese because the product answers in Portuguese.
        page_content = state.page_content[:3000] if state.page_content else "(Nenhum conteúdo extraído)"

        # Prompt (in Portuguese): synthesize task, collected data, last page
        # content, and up to 5 visited URLs into a sourced, formatted answer.
        prompt = f"""Você realizou uma tarefa de navegação web. Sintetize os resultados.

TAREFA: {state.task}

DADOS COLETADOS:
{context}

ÚLTIMO CONTEÚDO DA PÁGINA:
{page_content}

URLs VISITADAS:
{chr(10).join(state.visited_urls[:5]) if state.visited_urls else '(Nenhuma)'}

INSTRUÇÕES:
- Gere uma resposta útil baseada no que foi encontrado
- Use **negrito** para valores importantes (preços, números, nomes)
- Cite as fontes quando possível
- Se não encontrou o que foi pedido, explique o que encontrou ou diga honestamente que não encontrou

Responda em português de forma clara e organizada."""

        try:
            response = await generate_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000
            )
            state.final_result = response.strip()
            # An empty LLM response counts as failure.
            state.success = bool(state.final_result)
            logger.info(f"Generated response: {len(state.final_result)} chars")

        except Exception as e:
            logger.error(f"Response generation failed: {e}")
            # Fallback: create response from available data
            if state.page_content:
                state.final_result = f"**Informação encontrada:**\n\n{state.page_content[:500]}...\n\n*Fonte: {state.visited_urls[-1] if state.visited_urls else 'desconhecida'}*"
            else:
                state.final_result = f"Não foi possível completar a tarefa. Erro: {e}"

        return state, NodeType.RESPOND
|
app/agents/graph/runner.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Graph runner - executes the agent graph.
|
| 2 |
+
|
| 3 |
+
The runner orchestrates node execution, manages state transitions,
|
| 4 |
+
and yields status updates for streaming.
|
| 5 |
+
|
| 6 |
+
Uses timeout-based execution instead of fixed iteration count.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import time
|
| 11 |
+
from typing import AsyncGenerator, Dict, Type
|
| 12 |
+
|
| 13 |
+
from app.agents.graph.state import AgentState, NodeType
|
| 14 |
+
from app.agents.graph.nodes import (
|
| 15 |
+
BaseNode,
|
| 16 |
+
PlanNode,
|
| 17 |
+
SearchNode,
|
| 18 |
+
NavigateNode,
|
| 19 |
+
ExtractNode,
|
| 20 |
+
VerifyNode,
|
| 21 |
+
RespondNode,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# Node registry
|
| 27 |
+
NODE_REGISTRY: Dict[NodeType, Type[BaseNode]] = {
|
| 28 |
+
NodeType.PLAN: PlanNode,
|
| 29 |
+
NodeType.SEARCH: SearchNode,
|
| 30 |
+
NodeType.NAVIGATE: NavigateNode,
|
| 31 |
+
NodeType.EXTRACT: ExtractNode,
|
| 32 |
+
NodeType.VERIFY: VerifyNode,
|
| 33 |
+
NodeType.RESPOND: RespondNode,
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
# Status messages with emojis
|
| 37 |
+
STATUS_MESSAGES = {
|
| 38 |
+
NodeType.PLAN: "🎯 Planning task...",
|
| 39 |
+
NodeType.SEARCH: "🔍 Searching...",
|
| 40 |
+
NodeType.NAVIGATE: "🌐 Navigating...",
|
| 41 |
+
NodeType.EXTRACT: "📊 Extracting content...",
|
| 42 |
+
NodeType.VERIFY: "🤔 Analyzing...",
|
| 43 |
+
NodeType.RESPOND: "✅ Generating response...",
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
async def run_graph(state: AgentState) -> AsyncGenerator[dict, None]:
    """Run the agent graph and yield status updates.

    Drives the node state machine (PLAN -> SEARCH/NAVIGATE/EXTRACT/VERIFY
    -> RESPOND) until the state's timeout/success check stops it, yielding
    a "status" event per step, then one "result" event and one "complete"
    event.

    Args:
        state: Initial agent state with task, url, and desktop

    Yields:
        Status updates and final result
    """
    # Initialize timing
    state.start_time = time.time()
    current_node_type = NodeType.PLAN
    state.current_node = current_node_type

    logger.info(f"Starting graph execution for task: {state.task[:50]}, timeout: {state.timeout_seconds}s")

    # should_continue() is timeout/success based (no fixed iteration cap).
    while state.should_continue():
        state.step_count += 1
        state.current_node = current_node_type

        # Get node instance
        node_class = NODE_REGISTRY.get(current_node_type)
        if not node_class:
            logger.error(f"Unknown node type: {current_node_type}")
            break

        node = node_class()

        # Calculate remaining time
        remaining = int(state.get_remaining_time())
        elapsed = int(state.get_elapsed_time())  # NOTE: sampled before node.execute()

        # Yield status update; specialize SEARCH/NAVIGATE with query/URL.
        status_msg = STATUS_MESSAGES.get(current_node_type, "Processing...")
        if current_node_type == NodeType.SEARCH and state.plan.get("steps"):
            for step in state.plan["steps"]:
                if step.get("action") == "search":
                    status_msg = f"🔍 Searching: {step.get('query', state.task)[:40]}..."
                    break
        elif current_node_type == NodeType.NAVIGATE and state.url:
            status_msg = f"🌐 Navigating to {state.url[:40]}..."

        yield {
            "type": "status",
            "message": f"{status_msg} (step {state.step_count}, {remaining}s remaining)"
        }

        # Execute node
        try:
            state, next_node_type = await node.execute(state)
            logger.info(f"Step {state.step_count}: {current_node_type.value} -> {next_node_type.value} ({elapsed}s elapsed)")

            # Check if we're done (RESPOND is the terminal node)
            if current_node_type == NodeType.RESPOND:
                break

            # Transition to next node
            current_node_type = next_node_type

        except Exception as e:
            logger.exception(f"Node execution failed: {e}")
            state.add_error(str(e))

            # If running low on time, try to respond
            if state.get_remaining_time() < 30:
                current_node_type = NodeType.RESPOND
            else:
                current_node_type = NodeType.SEARCH

    # If we timed out without a result, generate one from what we have
    if not state.final_result and not state.success:
        logger.warning("Timeout reached, forcing response generation")
        respond_node = RespondNode()
        state, _ = await respond_node.execute(state)

    # Yield final result
    yield {
        "type": "result",
        "content": state.final_result,
        "links": state.visited_urls[:10],
        "success": state.success
    }

    yield {"type": "complete", "message": f"Task completed in {int(state.get_elapsed_time())}s"}

    logger.info(f"Graph execution complete. Success: {state.success}, Steps: {state.step_count}, Time: {state.get_elapsed_time():.1f}s")
|
| 133 |
+
|
app/agents/graph/simple_agent.py
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simplified agent nodes - ONE LLM call per cycle.
|
| 2 |
+
|
| 3 |
+
DAG:
|
| 4 |
+
START → THINK_ACT ←→ EXECUTE → RESPOND
|
| 5 |
+
↑______________|
|
| 6 |
+
|
| 7 |
+
ThinkAndAct: Analyzes content + decides action in ONE call
|
| 8 |
+
Execute: Runs the action (search, navigate, scroll) - NO LLM
|
| 9 |
+
Respond: Final synthesis
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
import shlex
|
| 15 |
+
import time
|
| 16 |
+
from abc import ABC, abstractmethod
|
| 17 |
+
from typing import Tuple, Optional, List
|
| 18 |
+
|
| 19 |
+
from app.agents.llm_client import generate_completion
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class SimpleState:
    """Lightweight run state for the simplified agent loop.

    Holds the task, a per-URL content cache (so each page is fetched at
    most once), the visit and action logs, accumulated findings, the final
    answer, and wall-clock budget accounting.
    """

    def __init__(self, task: str, url: Optional[str], desktop, timeout: float = 300):
        self.task = task
        self.url = url
        self.desktop = desktop
        self.timeout = timeout
        self.start_time = time.time()

        # Memory: page cache keyed by URL, plus visit/action logs.
        self.content_cache: dict = {}  # {url: content}
        self.visited_urls: List[str] = []
        self.action_history: List[str] = []

        # Key facts extracted so far.
        self.findings: List[str] = []

        # Outcome.
        self.final_result = ""
        self.done = False

    def elapsed(self) -> float:
        """Seconds since this state was created."""
        return time.time() - self.start_time

    def remaining(self) -> float:
        """Seconds left in the budget (never negative)."""
        return max(0, self.timeout - self.elapsed())

    def should_continue(self) -> bool:
        """True while not finished and at least 20s of budget remain."""
        return not self.done and self.remaining() > 20

    def add_page(self, url: str, content: str):
        """Cache a page (first fetch wins, capped at 4000 chars) and log the visit."""
        self.content_cache.setdefault(url, content[:4000])
        if url not in self.visited_urls:
            self.visited_urls.append(url)

    def get_cached_content(self, url: str) -> Optional[str]:
        """Return cached content for *url*, or None if never fetched."""
        return self.content_cache.get(url)

    def add_finding(self, finding: str):
        """Append a non-empty finding, skipping duplicates."""
        if finding and finding not in self.findings:
            self.findings.append(finding)

    def get_all_content(self) -> str:
        """Join the last 5 visited pages' cached content for final synthesis."""
        sections = [
            f"[{u[:60]}]\n{self.content_cache.get(u, '')[:1500]}"
            for u in self.visited_urls[-5:]
            if self.content_cache.get(u, "")
        ]
        return "\n\n---\n\n".join(sections)

    def get_recent_content(self) -> str:
        """Join the last 2 visited pages' cached content as working context."""
        sections = [
            f"[{u[:60]}]\n{self.content_cache.get(u, '')[:2000]}"
            for u in self.visited_urls[-2:]
            if self.content_cache.get(u, "")
        ]
        return "\n\n---\n\n".join(sections)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
async def think_and_act(state: SimpleState) -> Tuple[str, dict]:
    """
    ONE LLM call that analyzes current state and decides next action.
    Returns: (action_type, action_params)

    Actions:
    - search: {"query": "..."}
    - navigate: {"url": "..."}
    - scroll: {}
    - complete: {"result": "..."}

    Guardrails applied after the LLM answers: a "navigate" to an
    already-visited URL is rewritten into either a "complete" (when usable
    content exists) or a re-targeted "search"; any parse/LLM failure falls
    back to "complete" (if content exists) or a plain "search".
    """

    content = state.get_recent_content() or "(No content yet)"
    history = ", ".join(state.action_history[-5:]) if state.action_history else "(starting)"

    # Memory: show visited URLs so LLM doesn't repeat
    visited = "\n".join([f" - {u[:70]}" for u in state.visited_urls[-10:]]) if state.visited_urls else "(none)"

    prompt = f"""You are a web research agent. Analyze the current state and decide your next action.

TASK: {state.task}

ALREADY VISITED (DO NOT visit again):
{visited}

CURRENT PAGE CONTENT:
{content}

HISTORY: {history}
TIME REMAINING: {int(state.remaining())}s

Decide ONE action. Return JSON:

If you need to search: {{"action": "search", "query": "search terms"}}
If you found a NEW relevant link to visit: {{"action": "navigate", "url": "https://..."}}
If you need to scroll for more content: {{"action": "scroll"}}
If you have enough info to answer: {{"action": "complete", "result": "Your answer with **bold** for important values. Cite sources."}}

RULES:
- DO NOT navigate to URLs already in "ALREADY VISITED" list
- Only use URLs you see in the content above
- If you see the answer, return complete immediately
- Use **bold** for prices, numbers, names
- Be efficient - don't repeat searches

Return ONLY valid JSON:"""

    try:
        response = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=800
        )

        # Parse JSON: strip a leading Markdown code fence and an optional
        # "json" language tag before decoding.
        response = response.strip()
        if response.startswith("```"):
            response = response.split("```")[1]
        if response.startswith("json"):
            response = response[4:]

        decision = json.loads(response)
        action = decision.get("action", "search")

        # Safety check: prevent navigating to already visited URL
        if action == "navigate":
            url = decision.get("url", "").rstrip("/")

            # Check if URL already visited (normalize by removing trailing slash)
            visited_normalized = [u.rstrip("/") for u in state.visited_urls]
            if url in visited_normalized or url in state.visited_urls:
                logger.warning(f"LLM tried to revisit {url}, trying different approach")

                # If we have good content, finish (ignore blocked/login-wall
                # sentinel entries written by execute_action).
                good_content = [c for c in state.content_cache.values()
                                if c and c not in ["[BLOCKED]", "[LOGIN_REQUIRED]"]]
                if good_content:
                    return "complete", {"result": f"Informação coletada: {state.get_recent_content()[:800]}"}

                # Otherwise, search with different terms
                return "search", {"query": f"{state.task} site:wikipedia.org OR site:gov.br"}

        logger.info(f"ThinkAndAct decision: {action}")
        return action, decision

    except Exception as e:
        logger.error(f"ThinkAndAct failed: {e}")
        # Fallback: if we have content, try to respond
        if state.content_cache:
            return "complete", {"result": f"Based on collected data: {state.get_recent_content()[:500]}"}
        return "search", {"query": state.task}
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
async def execute_action(state: "SimpleState", action: str, params: dict) -> bool:
    """
    Execute action WITHOUT LLM call.
    Uses cache to avoid repeated requests.

    Args:
        state: Shared agent state (cache, visit log, desktop handle).
        action: One of "complete", "search", "navigate", "scroll".
        params: Parameters from the LLM decision (query/url/result).

    Returns:
        True if the agent loop should continue, False when the task is done.
    """
    from urllib.parse import quote_plus  # local import: only needed to build search URLs

    desktop = state.desktop

    if action == "complete":
        # Terminal action: store the final answer and stop the loop.
        state.final_result = params.get("result", "")
        state.done = True
        return False

    elif action == "search":
        query = params.get("query", state.task)
        # quote_plus() percent-encodes &, ?, #, etc. (spaces become '+'),
        # so queries with special characters can't corrupt the URL. For
        # plain-word queries this yields the same URL as the old
        # query.replace(' ', '+') behavior.
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"

        # Check cache first
        cached = state.get_cached_content(search_url)
        if cached:
            logger.info(f"Using cached content for search: {query[:30]}")
            state.action_history.append(f"search(cached):{query[:30]}")
            return True

        desktop.commands.run(f"google-chrome {shlex.quote(search_url)} &", background=True)
        desktop.wait(3000)

        content = await _extract_content(desktop, search_url)
        state.add_page(search_url, content)
        state.action_history.append(f"search:{query[:30]}")

        return True

    elif action == "navigate":
        url = params.get("url", "")
        if not url.startswith("http"):
            return True  # Invalid URL, continue

        # Check cache first - don't re-fetch
        cached = state.get_cached_content(url)
        if cached:
            logger.info(f"Using cached content for: {url[:50]}")
            state.action_history.append(f"nav(cached):{url[:30]}")
            return True

        desktop.commands.run(f"google-chrome {shlex.quote(url)} &", background=True)
        desktop.wait(3000)

        content = await _extract_content(desktop, url)

        # Check for Cloudflare/bot detection - just skip if blocked
        from app.agents.flaresolverr import is_cloudflare_blocked, is_login_wall

        if is_cloudflare_blocked(content):
            logger.warning(f"Cloudflare block detected at {url[:50]}, skipping...")
            # Mark as visited so LLM doesn't try again
            if url not in state.visited_urls:
                state.visited_urls.append(url)
            state.content_cache[url] = "[BLOCKED]"  # Sentinel: blocked page
            state.action_history.append(f"nav(blocked):{url[:30]}")
            return True

        if is_login_wall(content):
            logger.warning(f"Login wall detected at {url[:50]}, skipping...")
            # Mark as visited so LLM doesn't try again
            if url not in state.visited_urls:
                state.visited_urls.append(url)
            state.content_cache[url] = "[LOGIN_REQUIRED]"  # Sentinel: login wall
            state.action_history.append(f"nav(login_wall):{url[:30]}")
            return True

        state.add_page(url, content)
        state.action_history.append(f"nav:{url[:30]}")

        return True

    elif action == "scroll":
        desktop.scroll(-3)
        desktop.wait(1500)

        # Update cache for current page with new content
        if state.visited_urls:
            current_url = state.visited_urls[-1]
            content = await _extract_content(desktop, current_url)
            state.content_cache[current_url] = content[:4000]  # Update cache

        state.action_history.append("scroll")
        return True

    # Unknown action: ignore it and let the loop continue.
    return True
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
async def _extract_content(desktop, url: str) -> str:
|
| 276 |
+
"""Extract page content via curl."""
|
| 277 |
+
try:
|
| 278 |
+
result = desktop.commands.run(
|
| 279 |
+
f"curl -sL --max-time 8 --connect-timeout 5 "
|
| 280 |
+
f"-A 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36' "
|
| 281 |
+
f"'{url}' 2>/dev/null | "
|
| 282 |
+
"sed -e 's/<script[^>]*>.*<\\/script>//g' -e 's/<style[^>]*>.*<\\/style>//g' | "
|
| 283 |
+
"sed 's/<[^>]*>//g' | "
|
| 284 |
+
"tr -s ' \\n' ' ' | "
|
| 285 |
+
"head -c 6000",
|
| 286 |
+
timeout=12
|
| 287 |
+
)
|
| 288 |
+
return result.stdout.strip() if hasattr(result, 'stdout') else ""
|
| 289 |
+
except Exception as e:
|
| 290 |
+
logger.warning(f"Extract failed: {e}")
|
| 291 |
+
return ""
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
async def generate_final_response(state: SimpleState) -> str:
    """Fallback synthesis for runs that end without a completed answer.

    Returns the stored result if one already exists; otherwise asks the LLM
    to answer from whatever was collected, degrading to a Portuguese error
    message if the LLM call fails.
    """
    if state.final_result:
        return state.final_result

    collected = state.get_recent_content()
    data_section = collected if collected else "(No data collected)"
    sources = ", ".join(state.visited_urls[:5]) if state.visited_urls else "None"

    prompt = (
        "Based on the research done, answer the question.\n"
        "\n"
        f"TASK: {state.task}\n"
        "\n"
        "COLLECTED DATA:\n"
        f"{data_section}\n"
        "\n"
        f"SOURCES VISITED: {sources}\n"
        "\n"
        "Provide a helpful answer based on what was found. "
        "Use **bold** for important values. "
        "If you couldn't find the answer, say so honestly.\n"
        "\n"
        "Answer in Portuguese:"
    )

    try:
        answer = await generate_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000
        )
        return answer.strip()
    except Exception as e:
        return f"Não foi possível completar a pesquisa. Erro: {e}"
|
app/agents/graph/state.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent state management for graph-based execution.
|
| 2 |
+
|
| 3 |
+
The state is passed between nodes and accumulates information
|
| 4 |
+
throughout the agent's execution.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import Optional, Any
|
| 9 |
+
from enum import Enum
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
import time


class NodeType(Enum):
    """Types of nodes in the agent graph."""
    START = "start"
    PLAN = "plan"
    SEARCH = "search"
    NAVIGATE = "navigate"
    EXTRACT = "extract"
    VERIFY = "verify"
    RESPOND = "respond"
    ERROR = "error"


@dataclass
class AgentState:
    """Shared state passed between graph nodes.

    Accumulates plans, visited URLs, extracted data, action/error history,
    and the final result while the runner drives node execution.
    """

    # Task info
    task: str = ""
    url: Optional[str] = None

    # Planning
    plan: dict = field(default_factory=dict)
    current_subtask: int = 0

    # Execution
    current_node: NodeType = NodeType.START
    step_count: int = 0
    # Plain immutable default; a default_factory lambda is unnecessary here.
    # 0.0 means "clock not started yet".
    start_time: float = 0.0
    timeout_seconds: float = 300.0  # 5 minutes default

    # Memory
    visited_urls: list = field(default_factory=list)
    extracted_data: list = field(default_factory=list)
    page_content: str = ""
    window_title: str = ""

    # History
    action_history: list = field(default_factory=list)
    error_history: list = field(default_factory=list)

    # Results
    final_result: str = ""
    success: bool = False

    # Desktop reference (set at runtime)
    desktop: Any = None

    def add_action(self, action: dict):
        """Record an action taken at the current step/node."""
        self.action_history.append({
            "step": self.step_count,
            "node": self.current_node.value,
            "action": action
        })

    def add_error(self, error: str):
        """Record an error encountered at the current step."""
        self.error_history.append({
            "step": self.step_count,
            "error": error
        })

    def add_extracted_data(self, source: str, data: dict):
        """Record data extracted from a source, tagged with the last visited URL."""
        self.extracted_data.append({
            "source": source,
            "url": self.visited_urls[-1] if self.visited_urls else "",
            "data": data
        })

    def get_context_for_llm(self) -> str:
        """Format recent actions, extracted data, and recent errors for LLM prompts."""
        context_parts = []

        if self.action_history:
            recent = self.action_history[-5:]
            context_parts.append("Recent actions:")
            for h in recent:
                context_parts.append(f" - {h['node']}: {h['action']}")

        if self.extracted_data:
            context_parts.append("\nExtracted data:")
            for d in self.extracted_data:
                context_parts.append(f" - {d['source']}: {d['data']}")

        if self.error_history:
            context_parts.append("\nErrors encountered:")
            for e in self.error_history[-3:]:
                context_parts.append(f" - {e['error']}")

        return "\n".join(context_parts)

    def should_continue(self) -> bool:
        """Check if agent should continue execution based on timeout.

        Also lazily starts the clock on first call so callers need not set
        start_time themselves.
        """
        if self.start_time == 0:
            self.start_time = time.time()

        time_ok = self.get_elapsed_time() < self.timeout_seconds

        return (
            not self.success and
            time_ok and
            self.current_node != NodeType.ERROR
        )

    def get_elapsed_time(self) -> float:
        """Get elapsed time in seconds (0.0 if the clock was never started)."""
        if self.start_time == 0:
            return 0.0
        return time.time() - self.start_time

    def get_remaining_time(self) -> float:
        """Get remaining time in seconds (never negative)."""
        return max(0, self.timeout_seconds - self.get_elapsed_time())
|
app/agents/heavy_search.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Heavy Search Agent.
|
| 2 |
+
|
| 3 |
+
Middle-ground between Quick Search and Deep Research.
|
| 4 |
+
Scrapes full content from top results for richer answers.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
import time
|
| 9 |
+
from typing import AsyncIterator
|
| 10 |
+
|
| 11 |
+
from app.agents.llm_client import generate_completion_stream
|
| 12 |
+
from app.sources.aggregator import aggregate_search
|
| 13 |
+
from app.sources.scraper import scrape_multiple_urls
|
| 14 |
+
from app.reranking.pipeline import rerank_results
|
| 15 |
+
from app.temporal.intent_detector import detect_temporal_intent
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
async def run_heavy_search(
    query: str,
    max_results: int = 15,
    max_scrape: int = 8,
    freshness: str = "any",
) -> AsyncIterator[str]:
    """Run heavy search with content scraping, yielding SSE event strings.

    Pipeline:
        1. Aggregate search results from multiple sources.
        2. Rerank results (embeddings enabled only for large result sets).
        3. Scrape full page content from the top ``max_scrape`` URLs.
        4. Stream a synthesized answer, then a final ``done`` event.

    Args:
        query: User search query.
        max_results: Number of ranked results to keep after reranking.
        max_scrape: How many top-ranked URLs to fetch full content for.
        freshness: Freshness filter passed through to the aggregator
            ("any" by default; other accepted values are defined by
            ``aggregate_search`` — not visible here).

    Yields:
        SSE-formatted strings (see ``_sse_event``). Event types emitted:
        status, search_complete, scrape_complete, results, answer_start,
        answer_chunk, done — or error on failure / empty results.
    """
    # Wall-clock timer for the final "done" event.
    start_time = time.perf_counter()

    try:
        # Step 1: Status
        yield _sse_event("status", {"phase": "searching", "message": "Searching multiple sources..."})

        # Step 2: Aggregate search. Temporal intent is detected up front so
        # urgency can drive reranking and intent can shape the final answer.
        temporal_intent, temporal_urgency = detect_temporal_intent(query)

        # Over-fetch by 5 so reranking has headroom to drop weak results.
        raw_results = await aggregate_search(
            query=query,
            max_results=max_results + 5,
            freshness=freshness,
            include_wikipedia=True,
        )

        if not raw_results:
            yield _sse_event("error", {"message": "No results found"})
            return

        yield _sse_event("search_complete", {
            "results_count": len(raw_results),
            "sources": list(set(r.get("source", "unknown") for r in raw_results)),
        })

        # Step 3: Rerank (use embeddings when we have many results from SearXNG)
        yield _sse_event("status", {"phase": "ranking", "message": "Ranking results..."})

        # Enable embeddings only when volume justifies the extra cost.
        use_embeddings = len(raw_results) > 20

        ranked_results = await rerank_results(
            query=query,
            results=raw_results,
            temporal_urgency=temporal_urgency,
            max_results=max_results,
            use_embeddings=use_embeddings,
        )

        # Step 4: Scrape full content from the top-ranked URLs.
        yield _sse_event("status", {"phase": "scraping", "message": f"Reading top {max_scrape} sources..."})

        urls_to_scrape = [r.get("url") for r in ranked_results[:max_scrape] if r.get("url")]
        scraped_content = await scrape_multiple_urls(
            urls=urls_to_scrape,
            max_chars_per_url=4000,
            max_concurrent=3,
        )

        # Merge scraped content into results; fall back to the snippet when
        # scraping failed or returned nothing for a URL.
        for result in ranked_results:
            url = result.get("url", "")
            if url in scraped_content and scraped_content[url]:
                result["full_content"] = scraped_content[url]
                result["scraped"] = True
            else:
                result["full_content"] = result.get("content", "")
                result["scraped"] = False

        scraped_count = sum(1 for r in ranked_results if r.get("scraped"))
        yield _sse_event("scrape_complete", {
            "scraped_count": scraped_count,
            "total": len(urls_to_scrape),
        })

        # Step 5: Send the ranked result list (metadata only, no content).
        yield _sse_event("results", {
            "results": [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "score": r.get("score", 0),
                    "source": r.get("source", ""),
                    "scraped": r.get("scraped", False),
                }
                for r in ranked_results
            ],
            "temporal_intent": temporal_intent,
            "temporal_urgency": temporal_urgency,
        })

        # Step 6: Synthesize the answer and stream it chunk by chunk.
        yield _sse_event("status", {"phase": "synthesizing", "message": "Generating answer..."})
        yield _sse_event("answer_start", {})

        async for chunk in _synthesize_heavy_answer(query, ranked_results, temporal_intent):
            yield _sse_event("answer_chunk", {"content": chunk})

        # Done — report totals and elapsed time.
        total_time = time.perf_counter() - start_time
        yield _sse_event("done", {
            "total_sources": len(ranked_results),
            "scraped_sources": scraped_count,
            "total_time_seconds": round(total_time, 2),
        })

    except Exception as e:
        # Any failure is surfaced to the client as an SSE error event rather
        # than breaking the stream with an exception.
        yield _sse_event("error", {"message": str(e)})
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
async def _synthesize_heavy_answer(
    query: str,
    results: list[dict],
    temporal_intent: str,
) -> AsyncIterator[str]:
    """Synthesize and stream an answer from (partially) scraped results.

    Builds a context block from the top 8 results — marking each as [FULL]
    (scraped page content) or [SNIPPET] (search-engine snippet only) — then
    streams the LLM answer, followed by a Markdown "Sources" footer.

    Args:
        query: Original user query (the LLM is asked to answer in its language).
        results: Ranked result dicts; uses ``full_content``/``content``,
            ``title``, ``url`` and the ``scraped`` flag.
        temporal_intent: Temporal intent label injected into the prompt.

    Yields:
        Answer text chunks, then the citation footer lines.
    """
    # Build context with full content; cap each source at 3000 chars to keep
    # the prompt within the model's context budget.
    context_parts = []
    for i, r in enumerate(results[:8], 1):
        content = r.get("full_content", r.get("content", ""))[:3000]
        scraped_tag = "[FULL]" if r.get("scraped") else "[SNIPPET]"

        context_parts.append(
            f"[{i}] {r.get('title', 'Untitled')} {scraped_tag}\n"
            f"URL: {r.get('url', '')}\n"
            f"Content:\n{content}\n"
        )

    context = "\n---\n".join(context_parts)

    prompt = f"""You are a research assistant providing comprehensive answers.

QUERY: {query}
TEMPORAL INTENT: {temporal_intent}

SOURCES (some with full content [FULL], some with snippets [SNIPPET]):
{context}

INSTRUCTIONS:
1. Provide a comprehensive, well-structured answer
2. Use information from [FULL] sources more extensively
3. Cite sources using [1], [2], etc.
4. Write in the same language as the query
5. Be thorough but clear

Answer:"""

    messages = [
        {"role": "system", "content": "You are a helpful research assistant."},
        {"role": "user", "content": prompt},
    ]

    # Stream the raw LLM output straight through to the caller.
    async for chunk in generate_completion_stream(messages, temperature=0.3):
        yield chunk

    # Add citations: one line per source, with an icon distinguishing
    # fully-scraped pages from snippet-only results.
    yield "\n\n---\n**Sources:**\n"
    for i, r in enumerate(results[:8], 1):
        scraped = "📄" if r.get("scraped") else "📋"
        yield f"{scraped} [{i}] [{r.get('title', 'Untitled')}]({r.get('url', '')})\n"
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _sse_event(event_type: str, data: dict) -> str:
|
| 190 |
+
"""Format an SSE event."""
|
| 191 |
+
payload = {"type": event_type, **data}
|
| 192 |
+
return f"data: {json.dumps(payload)}\n\n"
|
app/agents/llm_client.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM client abstraction for multiple providers.
|
| 2 |
+
|
| 3 |
+
Supports Groq and OpenRouter for LLM inference.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import httpx
|
| 7 |
+
import json
|
| 8 |
+
from typing import Optional, AsyncIterator
|
| 9 |
+
import asyncio
|
| 10 |
+
|
| 11 |
+
from tenacity import (
|
| 12 |
+
retry,
|
| 13 |
+
stop_after_attempt,
|
| 14 |
+
wait_exponential,
|
| 15 |
+
retry_if_exception_type,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
from app.config import get_settings
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class RetryableError(Exception):
    """Signals a transient failure (rate limit, 5xx, timeout) worth retrying."""
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
async def generate_completion(
    messages: list[dict],
    model: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> str:
    """Run one non-streaming chat completion via the configured provider.

    Args:
        messages: OpenAI-style chat messages (``role``/``content`` dicts).
        model: Explicit model name; falls back to the configured default.
        temperature: Sampling temperature forwarded to the backend.
        max_tokens: Completion length cap forwarded to the backend.

    Returns:
        The assistant message content.

    Raises:
        ValueError: If the configured provider name is not recognized.
    """
    cfg = get_settings()
    chosen_model = model if model else cfg.llm_model

    # Dispatch table instead of an if/elif chain; both backends share the
    # same positional call signature.
    backends = {
        "groq": _call_groq,
        "openrouter": _call_openrouter,
    }
    backend = backends.get(cfg.llm_provider)
    if backend is None:
        raise ValueError(f"Unknown LLM provider: {cfg.llm_provider}")
    return await backend(messages, chosen_model, temperature, max_tokens)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(RetryableError),
    reraise=True,
)
async def _call_groq(
    messages: list[dict],
    model: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Call the Groq chat-completions API with retry logic.

    Retries up to 3 times with exponential backoff (2-10s) on transient
    failures (429/5xx responses and timeouts, surfaced as RetryableError);
    the last exception is re-raised once attempts are exhausted.

    Args:
        messages: OpenAI-style chat messages.
        model: Groq model identifier.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        The assistant message content.

    Raises:
        ValueError: If no Groq API key is configured.
        RetryableError: On transient errors, after retries are exhausted.
        httpx.HTTPStatusError: For non-retryable 4xx responses.
    """
    settings = get_settings()

    if not settings.groq_api_key:
        raise ValueError("GROQ_API_KEY not configured")

    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {settings.groq_api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": messages,
                    "temperature": temperature,
                    "max_tokens": max_tokens,
                },
            )

            # Retry on rate limit or server errors
            if response.status_code in (429, 502, 503, 504):
                raise RetryableError(f"Groq error {response.status_code}")

            response.raise_for_status()
            data = response.json()

            return data["choices"][0]["message"]["content"]
    except httpx.TimeoutException as e:
        # Timeouts are transient — convert to RetryableError so the
        # decorator retries them too.
        raise RetryableError(f"Groq timeout: {e}")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(RetryableError),
    reraise=True,
)
async def _call_openrouter(
    messages: list[dict],
    model: str,
    temperature: float,
    max_tokens: int,
) -> str:
    """Call the OpenRouter chat-completions API with retry logic.

    Retries up to 3 times with exponential backoff (2-10s) on transient
    failures (429/5xx responses and timeouts, via RetryableError).

    Args:
        messages: OpenAI-style chat messages.
        model: OpenRouter model identifier.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Returns:
        The assistant message content.

    Raises:
        ValueError: If no API key is configured, or on a non-200,
            non-retryable response.
        RetryableError: On transient errors, after retries are exhausted.
    """
    settings = get_settings()

    if not settings.openrouter_api_key:
        raise ValueError("OPENROUTER_API_KEY not configured")

    headers = {
        "Authorization": f"Bearer {settings.openrouter_api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://madras1-lancer.hf.space",
        "X-Title": "Lancer Search API",
    }

    payload = {
        "model": model,
        "messages": messages,
        # FIX: temperature and max_tokens were accepted but never sent,
        # so callers' settings were silently ignored (inconsistent with
        # _call_groq). Forward them like the OpenAI-compatible API expects.
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            response = await client.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers=headers,
                content=json.dumps(payload),
            )

            # Retry on rate limit or server errors
            if response.status_code in (429, 502, 503, 504):
                raise RetryableError(f"OpenRouter error {response.status_code}")

            if response.status_code != 200:
                error_text = response.text
                raise ValueError(f"OpenRouter error {response.status_code}: {error_text}")

            data = response.json()
            return data["choices"][0]["message"]["content"]
    except httpx.TimeoutException as e:
        # Timeouts are transient — convert so the retry decorator fires.
        raise RetryableError(f"OpenRouter timeout: {e}")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
async def generate_completion_stream(
    messages: list[dict],
    model: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 2048,
) -> AsyncIterator[str]:
    """Generate a streaming completion using OpenRouter.

    Parses the OpenAI-style SSE stream and yields only the text deltas.
    Note: this path always uses OpenRouter regardless of the configured
    provider (only the model default comes from settings).

    Args:
        messages: OpenAI-style chat messages.
        model: Explicit model name; falls back to the configured default.
        temperature: Sampling temperature.
        max_tokens: Completion length cap.

    Yields:
        Non-empty content chunks as they arrive.

    Raises:
        ValueError: If no API key is configured or the request fails.
    """
    settings = get_settings()
    model = model or settings.llm_model

    if not settings.openrouter_api_key:
        raise ValueError("OPENROUTER_API_KEY not configured")

    headers = {
        "Authorization": f"Bearer {settings.openrouter_api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://madras1-lancer.hf.space",
        "X-Title": "Lancer Search API",
    }

    payload = {
        "model": model,
        "messages": messages,
        "stream": True,
        # FIX: temperature and max_tokens were accepted but never sent,
        # silently ignoring callers' settings. Forward them as the
        # OpenAI-compatible streaming API expects.
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    async with httpx.AsyncClient(timeout=120.0) as client:
        async with client.stream(
            "POST",
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            content=json.dumps(payload),
        ) as response:
            if response.status_code != 200:
                # Must read the body explicitly in streaming mode.
                error_text = await response.aread()
                raise ValueError(f"OpenRouter streaming error {response.status_code}: {error_text}")

            async for line in response.aiter_lines():
                if line.startswith("data: "):
                    data_str = line[6:]
                    if data_str.strip() == "[DONE]":
                        break
                    try:
                        data = json.loads(data_str)
                        delta = data.get("choices", [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except json.JSONDecodeError:
                        # Skip keep-alive / malformed frames silently.
                        continue
|
app/agents/planner.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Research Planner Agent.
|
| 2 |
+
|
| 3 |
+
Decomposes complex queries into multiple research dimensions.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
from app.agents.llm_client import generate_completion
|
| 12 |
+
from app.config import get_settings
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ResearchDimension(BaseModel):
    """A single dimension/aspect to research.

    Each dimension maps to one targeted search pass. Pydantic enforces
    ``priority`` within 1-3 via the ge/le constraints — values outside
    that range raise a ValidationError at construction time.
    """

    name: str = Field(..., description="Short name for this dimension")
    description: str = Field(..., description="What this dimension covers")
    search_query: str = Field(..., description="Optimized search query for this dimension")
    priority: int = Field(default=1, ge=1, le=3, description="1=high, 2=medium, 3=low")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ResearchPlan(BaseModel):
    """Complete research plan with all dimensions.

    Produced by ``create_research_plan``; dimensions are sorted by
    ascending priority (high first) by the builder, not by this model.
    """

    # The user's query exactly as received.
    original_query: str
    refined_query: str = Field(..., description="Clarified version of the query")
    # Ordered list of research angles to investigate.
    dimensions: list[ResearchDimension]
    # Rough budget hint for downstream search (builder uses 5 per dimension).
    estimated_sources: int = Field(default=20)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
PLANNER_PROMPT = """You are a research planning assistant. Your job is to decompose a complex query into multiple research dimensions.
|
| 34 |
+
|
| 35 |
+
USER QUERY: {query}
|
| 36 |
+
|
| 37 |
+
INSTRUCTIONS:
|
| 38 |
+
1. Analyze the query and identify 2-6 key dimensions/aspects that need to be researched
|
| 39 |
+
2. Each dimension should be distinct and cover a different angle
|
| 40 |
+
3. Create an optimized search query for each dimension
|
| 41 |
+
4. Assign priority (1=high, 2=medium, 3=low) based on relevance to the main query
|
| 42 |
+
5. Respond ONLY with valid JSON, no other text
|
| 43 |
+
|
| 44 |
+
OUTPUT FORMAT:
|
| 45 |
+
{{
|
| 46 |
+
"refined_query": "A clearer version of the user's query",
|
| 47 |
+
"dimensions": [
|
| 48 |
+
{{
|
| 49 |
+
"name": "Short name",
|
| 50 |
+
"description": "What this covers",
|
| 51 |
+
"search_query": "Optimized search query",
|
| 52 |
+
"priority": 1
|
| 53 |
+
}}
|
| 54 |
+
]
|
| 55 |
+
}}
|
| 56 |
+
|
| 57 |
+
Generate the research plan:"""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
async def create_research_plan(
    query: str,
    max_dimensions: int = 6,
) -> ResearchPlan:
    """Create a research plan by decomposing a query into dimensions.

    Asks the LLM for a JSON plan, parses it defensively, and falls back
    to a generic two-dimension plan if the response is unusable.

    Args:
        query: The user's research query.
        max_dimensions: Maximum number of dimensions to keep.

    Returns:
        ResearchPlan with dimensions to investigate (sorted high-priority
        first), or a fallback plan if the LLM output could not be parsed.
    """
    messages = [
        {"role": "system", "content": "You are a research planning assistant. Always respond with valid JSON only."},
        {"role": "user", "content": PLANNER_PROMPT.format(query=query)},
    ]

    try:
        response = await generate_completion(messages, temperature=0.3)

        # The model sometimes wraps JSON in prose — extract the outermost
        # {...} span before parsing.
        json_start = response.find("{")
        json_end = response.rfind("}") + 1
        if json_start >= 0 and json_end > json_start:
            response = response[json_start:json_end]

        data = json.loads(response)

        # Build dimensions, clamping priority into the model's 1-3 range.
        # FIX: an out-of-range or non-integer priority previously raised a
        # pydantic ValidationError that was NOT caught below, so the
        # fallback plan never triggered for that failure mode.
        dimensions = []
        for dim_data in data.get("dimensions", [])[:max_dimensions]:
            raw_priority = int(dim_data.get("priority", 2))
            dimensions.append(ResearchDimension(
                name=dim_data.get("name", "Unknown"),
                description=dim_data.get("description", ""),
                search_query=dim_data.get("search_query", query),
                priority=min(3, max(1, raw_priority)),
            ))

        # High priority (1) first.
        dimensions.sort(key=lambda d: d.priority)

        return ResearchPlan(
            original_query=query,
            refined_query=data.get("refined_query", query),
            dimensions=dimensions,
            estimated_sources=len(dimensions) * 5,
        )

    # TypeError/ValueError cover malformed structures (non-dict entries,
    # non-numeric priority) surfaced by the parsing above.
    except (json.JSONDecodeError, KeyError, TypeError, ValueError):
        # Fallback: create a simple 2-dimension plan
        return ResearchPlan(
            original_query=query,
            refined_query=query,
            dimensions=[
                ResearchDimension(
                    name="Main Research",
                    description=f"Primary research on: {query}",
                    search_query=query,
                    priority=1,
                ),
                ResearchDimension(
                    name="Background",
                    description=f"Background and context for: {query}",
                    search_query=f"{query} background overview",
                    priority=2,
                ),
            ],
            estimated_sources=10,
        )
|
app/agents/synthesizer.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Answer synthesizer agent.
|
| 2 |
+
|
| 3 |
+
Generates a coherent answer from search results with citations.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from typing import Optional, AsyncIterator
|
| 8 |
+
|
| 9 |
+
from app.api.schemas import SearchResult, TemporalContext, Citation
|
| 10 |
+
from app.agents.llm_client import generate_completion, generate_completion_stream
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SYNTHESIS_PROMPT = """You are a research assistant that synthesizes information from search results.
|
| 14 |
+
|
| 15 |
+
CURRENT DATE: {current_date}
|
| 16 |
+
|
| 17 |
+
USER QUERY: {query}
|
| 18 |
+
|
| 19 |
+
TEMPORAL CONTEXT:
|
| 20 |
+
- Query intent: {temporal_intent} (the user {intent_explanation})
|
| 21 |
+
- Temporal urgency: {temporal_urgency:.0%} (how important freshness is)
|
| 22 |
+
|
| 23 |
+
SEARCH RESULTS:
|
| 24 |
+
{formatted_results}
|
| 25 |
+
|
| 26 |
+
INSTRUCTIONS:
|
| 27 |
+
1. Synthesize a comprehensive answer based on the search results
|
| 28 |
+
2. ALWAYS cite your sources using [1], [2], etc. format
|
| 29 |
+
3. If the query requires current information, prioritize the most recent results
|
| 30 |
+
4. If there are conflicting dates or versions mentioned, use the most recent accurate information
|
| 31 |
+
5. Be concise but thorough
|
| 32 |
+
6. If information seems outdated compared to current date ({current_date}), note this
|
| 33 |
+
7. Write in the same language as the query
|
| 34 |
+
|
| 35 |
+
Generate your answer:"""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
async def synthesize_answer(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> tuple[str, list[Citation]]:
    """Synthesize a single (non-streaming) answer from search results.

    Args:
        query: Original search query.
        results: Search results to synthesize from (top 10 are used for
            both the prompt and the citations).
        temporal_context: Optional temporal analysis to steer the prompt.

    Returns:
        Tuple of (answer_text, citations_list). On LLM failure the answer
        text is an error message — this function never raises for that.
    """
    if not results:
        return "No results found to synthesize an answer.", []

    messages = _build_messages(query, results, temporal_context)

    try:
        answer = await generate_completion(messages, temperature=0.3)
    except Exception as e:
        # Fallback: degrade gracefully instead of failing the request —
        # the caller still gets the raw results and citations.
        answer = f"Error generating synthesis: {e}. Please review the search results directly."

    # Citations are built regardless of LLM success.
    citations = _build_citations(results)

    return answer, citations
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
async def synthesize_answer_stream(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> AsyncIterator[str]:
    """Synthesize an answer with streaming output.

    Streaming counterpart of ``synthesize_answer`` (same prompt via
    ``_build_messages``); citations are NOT emitted here — callers that
    need them use ``_build_citations`` separately.

    Args:
        query: Original search query.
        results: Search results to synthesize from.
        temporal_context: Optional temporal analysis to steer the prompt.

    Yields:
        Chunks of the answer text; on LLM failure, a single error-message
        chunk (this generator never raises for that).
    """
    if not results:
        yield "No results found to synthesize an answer."
        return

    messages = _build_messages(query, results, temporal_context)

    try:
        async for chunk in generate_completion_stream(messages, temperature=0.3):
            yield chunk
    except Exception as e:
        # Same graceful degradation as the non-streaming path.
        yield f"Error generating synthesis: {e}. Please review the search results directly."
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _build_messages(
    query: str,
    results: list[SearchResult],
    temporal_context: Optional[TemporalContext] = None,
) -> list[dict]:
    """Build the chat messages for the synthesis LLM call.

    Formats the top 10 results and fills SYNTHESIS_PROMPT with temporal
    context (defaults: today's date, "neutral" intent, 0.5 urgency when no
    context is provided).
    """
    # Format results for the prompt
    formatted_results = format_results_for_prompt(results[:10])  # Top 10 only

    # Prepare temporal defaults; overridden below when context is present.
    current_date = datetime.now().strftime("%Y-%m-%d")
    temporal_intent = "neutral"
    temporal_urgency = 0.5

    if temporal_context:
        temporal_intent = temporal_context.query_temporal_intent
        temporal_urgency = temporal_context.temporal_urgency
        current_date = temporal_context.current_date

    # Map intent to a human-readable explanation for the prompt; unknown
    # intents degrade to an empty explanation rather than a KeyError.
    intent_explanations = {
        "current": "is looking for the most recent/current information",
        "historical": "is interested in historical or background information",
        "neutral": "has no specific temporal preference",
    }

    prompt = SYNTHESIS_PROMPT.format(
        current_date=current_date,
        query=query,
        temporal_intent=temporal_intent,
        intent_explanation=intent_explanations.get(temporal_intent, ""),
        temporal_urgency=temporal_urgency,
        formatted_results=formatted_results,
    )

    return [
        {"role": "system", "content": "You are a helpful research assistant."},
        {"role": "user", "content": prompt},
    ]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _build_citations(results: list[SearchResult]) -> list[Citation]:
    """Map the top 10 results to 1-indexed Citation entries."""
    return [
        Citation(index=pos, url=res.url, title=res.title)
        for pos, res in enumerate(results[:10], 1)
    ]
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def format_results_for_prompt(results: list[SearchResult]) -> str:
    """Render numbered result summaries for inclusion in the LLM prompt.

    Each entry shows title (with publish date when known), URL, freshness
    and authority percentages, and the first 500 characters of content.
    """

    def render(rank: int, res: SearchResult) -> str:
        # Publish date is optional; omit the suffix entirely when absent.
        when = (
            f" (Published: {res.published_date.strftime('%Y-%m-%d')})"
            if res.published_date
            else ""
        )
        return (
            f"[{rank}] {res.title}{when}\n"
            f" URL: {res.url}\n"
            f" Freshness: {res.freshness_score:.0%} | Authority: {res.authority_score:.0%}\n"
            f" Content: {res.content[:500]}..."
        )

    return "\n\n".join(render(rank, res) for rank, res in enumerate(results, 1))
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API routes package."""
|
app/api/routes/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API routes package."""
|
app/api/routes/search.py
ADDED
|
@@ -0,0 +1,579 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search API routes."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, Request
|
| 8 |
+
from fastapi.responses import StreamingResponse
|
| 9 |
+
|
| 10 |
+
from app.api.schemas import (
|
| 11 |
+
SearchRequest,
|
| 12 |
+
SearchResponse,
|
| 13 |
+
SearchResult,
|
| 14 |
+
TemporalContext,
|
| 15 |
+
Citation,
|
| 16 |
+
ErrorResponse,
|
| 17 |
+
DeepResearchRequest,
|
| 18 |
+
BrowseRequest,
|
| 19 |
+
)
|
| 20 |
+
from app.config import get_settings
|
| 21 |
+
from app.temporal.intent_detector import detect_temporal_intent
|
| 22 |
+
from app.temporal.freshness_scorer import calculate_freshness_score
|
| 23 |
+
from app.sources.tavily import search_tavily
|
| 24 |
+
from app.sources.duckduckgo import search_duckduckgo
|
| 25 |
+
from app.reranking.pipeline import rerank_results
|
| 26 |
+
from app.agents.synthesizer import synthesize_answer, synthesize_answer_stream
|
| 27 |
+
from app.middleware.rate_limiter import limiter
|
| 28 |
+
|
| 29 |
+
router = APIRouter()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@router.post(
    "/search",
    response_model=SearchResponse,
    responses={500: {"model": ErrorResponse}},
    summary="Search with AI synthesis",
    description="Perform a search with temporal intelligence and return an AI-synthesized answer.",
)
@limiter.limit("30/minute")
async def search(request: Request, body: SearchRequest) -> SearchResponse:
    """
    Perform an intelligent search with:
    - Temporal intent detection
    - Multi-source search
    - Multi-stage reranking
    - AI-powered answer synthesis

    Raises:
        HTTPException: 500 with the underlying error message if any pipeline
            stage fails unexpectedly.
    """
    start_time = time.perf_counter()
    settings = get_settings()

    try:
        # Step 1: Analyze temporal intent (how much result freshness matters).
        temporal_intent, temporal_urgency = detect_temporal_intent(body.query)

        temporal_context = TemporalContext(
            query_temporal_intent=temporal_intent,
            temporal_urgency=temporal_urgency,
            current_date=datetime.now().strftime("%Y-%m-%d"),
        )

        # Step 2: Search multiple sources
        raw_results = []

        # Try Tavily first (best quality)
        if settings.tavily_api_key:
            tavily_results = await search_tavily(
                query=body.query,
                max_results=settings.max_search_results,
                freshness=body.freshness,
                include_domains=body.include_domains,
                exclude_domains=body.exclude_domains,
            )
            raw_results.extend(tavily_results)

        # Fallback to DuckDuckGo if Tavily is unconfigured or returned nothing
        if not raw_results:
            ddg_results = await search_duckduckgo(
                query=body.query,
                max_results=settings.max_search_results,
            )
            raw_results.extend(ddg_results)

        if not raw_results:
            # Graceful empty response rather than an error status.
            return SearchResponse(
                query=body.query,
                answer="No results found for your query.",
                results=[],
                citations=[],
                temporal_context=temporal_context,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
            )

        # Step 3: Apply multi-stage reranking
        ranked_results = await rerank_results(
            query=body.query,
            results=raw_results,
            temporal_urgency=temporal_urgency,
            max_results=body.max_results,
        )

        # Step 4: Convert raw result dicts into SearchResult models
        search_results = []
        for result in ranked_results:
            freshness = calculate_freshness_score(result.get("published_date"))
            search_results.append(
                SearchResult(
                    title=result.get("title", ""),
                    url=result.get("url", ""),
                    content=result.get("content", ""),
                    score=result.get("score", 0.5),
                    published_date=result.get("published_date"),
                    freshness_score=freshness,
                    authority_score=result.get("authority_score", 0.5),
                )
            )

        # Step 5: Synthesize answer (if requested)
        answer = None
        citations = []

        if body.include_answer and search_results:
            answer, citations = await synthesize_answer(
                query=body.query,
                results=search_results,
                temporal_context=temporal_context,
            )

        processing_time = (time.perf_counter() - start_time) * 1000

        return SearchResponse(
            query=body.query,
            answer=answer,
            results=search_results,
            citations=citations,
            temporal_context=temporal_context,
            processing_time_ms=processing_time,
        )

    except HTTPException:
        # Don't re-wrap deliberate HTTP errors as opaque 500s.
        raise
    except Exception as e:
        # Chain the original exception so server logs show the real cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@router.post(
    "/search/raw",
    response_model=SearchResponse,
    summary="Search without synthesis",
    description="Perform a search and return raw results without AI synthesis (faster).",
)
@limiter.limit("30/minute")
async def search_raw(request: Request, body: SearchRequest) -> SearchResponse:
    """Run the standard search pipeline with LLM synthesis disabled."""
    # Force-disable synthesis on the request, then delegate to /search.
    body.include_answer = False
    return await search(request, body)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@router.post(
    "/search/stream",
    summary="Search with streaming synthesis",
    description="Perform a search and stream the AI-synthesized answer in real-time using SSE.",
)
@limiter.limit("30/minute")
async def search_stream(request: Request, body: SearchRequest):
    """
    Streaming search with Server-Sent Events.

    Returns results first, then streams the answer as it's generated.

    Event sequence: `results` (ranked results + temporal context), then
    `answer_start`, repeated `answer_chunk`, and finally `done`.
    Failures are reported as an `error` event inside the stream, not as an
    HTTP error status (headers are already sent by then).
    """
    settings = get_settings()

    async def event_generator():
        try:
            # Step 1: Analyze temporal intent
            temporal_intent, temporal_urgency = detect_temporal_intent(body.query)

            temporal_context = TemporalContext(
                query_temporal_intent=temporal_intent,
                temporal_urgency=temporal_urgency,
                current_date=datetime.now().strftime("%Y-%m-%d"),
            )

            # Step 2: Search sources (Tavily first, DuckDuckGo as fallback)
            raw_results = []

            if settings.tavily_api_key:
                tavily_results = await search_tavily(
                    query=body.query,
                    max_results=settings.max_search_results,
                    freshness=body.freshness,
                    include_domains=body.include_domains,
                    exclude_domains=body.exclude_domains,
                )
                raw_results.extend(tavily_results)

            if not raw_results:
                ddg_results = await search_duckduckgo(
                    query=body.query,
                    max_results=settings.max_search_results,
                )
                raw_results.extend(ddg_results)

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'content': 'No results found'})}\n\n"
                return

            # Step 3: Rerank
            ranked_results = await rerank_results(
                query=body.query,
                results=raw_results,
                temporal_urgency=temporal_urgency,
                max_results=body.max_results,
            )

            # Step 4: Convert to SearchResult models
            search_results = []
            for result in ranked_results:
                freshness = calculate_freshness_score(result.get("published_date"))
                search_results.append(
                    SearchResult(
                        title=result.get("title", ""),
                        url=result.get("url", ""),
                        content=result.get("content", ""),
                        score=result.get("score", 0.5),
                        published_date=result.get("published_date"),
                        freshness_score=freshness,
                        authority_score=result.get("authority_score", 0.5),
                    )
                )

            # Send results first (mode="json" so datetimes serialize cleanly)
            results_data = {
                "type": "results",
                "results": [r.model_dump(mode="json") for r in search_results],
                "temporal_context": temporal_context.model_dump(),
            }
            yield f"data: {json.dumps(results_data)}\n\n"

            # Step 5: Stream answer chunk-by-chunk as the LLM produces it
            yield f"data: {json.dumps({'type': 'answer_start'})}\n\n"

            async for chunk in synthesize_answer_stream(
                query=body.query,
                results=search_results,
                temporal_context=temporal_context,
            ):
                yield f"data: {json.dumps({'type': 'answer_chunk', 'content': chunk})}\n\n"

            yield f"data: {json.dumps({'type': 'done'})}\n\n"

        except Exception as e:
            # Mid-stream failure: report inside the stream.
            yield f"data: {json.dumps({'type': 'error', 'content': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            # Disable reverse-proxy buffering so events flush immediately.
            "X-Accel-Buffering": "no",
        },
    )
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# === Deep Research Endpoints ===
|
| 265 |
+
|
| 266 |
+
@router.post(
    "/research/deep",
    summary="Deep research with multi-dimensional analysis",
    description="Decompose a query into dimensions, search each in parallel, and generate a comprehensive report.",
)
@limiter.limit("5/minute")
async def deep_research(request: Request, body: DeepResearchRequest):
    """
    Run deep research with streaming progress updates.

    Returns SSE events:
    - plan_ready: Research plan with dimensions
    - dimension_start/complete: Progress per dimension
    - report_chunk: Streaming report content
    - done: Final summary
    """
    from app.agents.deep_research import run_deep_research

    event_stream = run_deep_research(
        query=body.query,
        max_dimensions=body.max_dimensions,
        max_sources_per_dim=body.max_sources_per_dim,
        max_total_searches=body.max_total_searches,
    )
    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_stream,
        media_type="text/event-stream",
        headers=sse_headers,
    )
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
@router.post(
    "/search/heavy",
    summary="Heavy search with content scraping",
    description="Search with full content extraction from top sources for richer answers.",
)
@limiter.limit("10/minute")
async def heavy_search(request: Request, body: SearchRequest):
    """
    Heavy search with content scraping.

    Scrapes full content from top results instead of just snippets,
    providing richer context for answer generation.
    """
    from app.agents.heavy_search import run_heavy_search

    sse_headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "X-Accel-Buffering": "no",
    }
    event_stream = run_heavy_search(
        query=body.query,
        max_results=body.max_results,
        max_scrape=5,
        freshness=body.freshness,
    )
    return StreamingResponse(
        event_stream,
        media_type="text/event-stream",
        headers=sse_headers,
    )
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
@router.get(
    "/images",
    summary="Search for images",
    description="Search for images related to a query using Brave Image Search.",
)
@limiter.limit("60/minute")
async def image_search(request: Request, query: str, max_results: int = 6):
    """
    Search for images related to a query.

    Returns a list of image results with thumbnails and source URLs.
    """
    from app.sources.images import search_images

    # Reject empty query strings (FastAPI only enforces presence, not content).
    if not query:
        raise HTTPException(status_code=400, detail="Query is required")

    found = await search_images(query=query, max_results=max_results)
    return {"query": query, "images": found}
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# === SearXNG Search (pure - no LLM) ===
|
| 354 |
+
|
| 355 |
+
@router.post(
    "/search/searxng",
    summary="Search using SearXNG + embedding reranking",
    description="Uses SearXNG meta-search with embedding reranking. No LLM synthesis.",
)
@limiter.limit("20/minute")
async def searxng_search(request: Request, body: SearchRequest):
    """
    Search using SearXNG with embedding reranking only.

    This endpoint uses your SearXNG instance for 50+ results
    and reranks with embeddings. No LLM synthesis.
    """
    # Lazy imports keep startup light; `json` is already imported at module level.
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            # Step 1: Search SearXNG
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching SearXNG...'})}\n\n"

            # SearXNG only understands day/week/month; other values mean "no filter".
            time_range = body.freshness if body.freshness in ("day", "week", "month") else None
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                time_range=time_range,
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No results from SearXNG'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'searxng_complete', 'count': len(raw_results)})}\n\n"

            # Step 2: Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Reranking with embeddings...'})}\n\n"

            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend: embedding similarity dominates (70%), engine score contributes 30%.
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            # Step 3: Return results (no LLM)
            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"

            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# === Code Search (GitHub, StackOverflow) ===
|
| 423 |
+
|
| 424 |
+
@router.post(
    "/search/code",
    summary="Search code repositories and programming Q&A",
    description="Uses SearXNG with GitHub, StackOverflow, and code-focused engines.",
)
@limiter.limit("20/minute")
async def code_search(request: Request, body: SearchRequest):
    """
    Search for code, programming solutions, and documentation.
    Uses GitHub, StackOverflow, GitLab, and other code-focused engines.
    """
    # Lazy imports keep startup light; `json` is already imported at module level.
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching code repositories...'})}\n\n"

            # Use code-specific engines
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                categories=["it"],  # IT category includes code engines
                engines=["github", "stackoverflow", "gitlab", "npm", "pypi", "crates.io", "packagist"],
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No code results found'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'search_complete', 'count': len(raw_results)})}\n\n"

            # Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Ranking by relevance...'})}\n\n"

            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend embedding similarity (70%) with the engine's own score (30%).
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"
            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
# === Academic Search (arXiv, Google Scholar) ===
|
| 485 |
+
|
| 486 |
+
@router.post(
    "/search/academic",
    summary="Search academic papers and research",
    description="Uses SearXNG with arXiv, Google Scholar, Semantic Scholar, and academic engines.",
)
@limiter.limit("20/minute")
async def academic_search(request: Request, body: SearchRequest):
    """
    Search for academic papers, research, and scientific content.
    Uses arXiv, Google Scholar, Semantic Scholar, PubMed, and other academic engines.
    """
    # Lazy imports keep startup light; `json` is already imported at module level.
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    async def event_generator():
        try:
            yield f"data: {json.dumps({'type': 'status', 'message': 'Searching academic sources...'})}\n\n"

            # Use academic engines
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                categories=["science"],
                engines=["arxiv", "google scholar", "semantic scholar", "pubmed", "base", "crossref"],
            )

            if not raw_results:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No academic results found'})}\n\n"
                return

            yield f"data: {json.dumps({'type': 'search_complete', 'count': len(raw_results)})}\n\n"

            # Rerank with embeddings
            yield f"data: {json.dumps({'type': 'status', 'message': 'Ranking by relevance...'})}\n\n"

            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend embedding similarity (70%) with the engine's own score (30%).
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            yield f"data: {json.dumps({'type': 'results', 'results': [{'title': r.get('title'), 'url': r.get('url'), 'content': r.get('content', '')[:300], 'score': round(r.get('score', 0), 3), 'source': r.get('source')} for r in final_results]})}\n\n"
            yield f"data: {json.dumps({'type': 'done', 'total_raw': len(raw_results), 'returned': len(final_results)})}\n\n"

        except Exception as e:
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
# === Browser Agent ===
|
| 547 |
+
|
| 548 |
+
@router.post(
    "/agent/browse",
    summary="Browser agent - navigate and extract from websites",
    description="Uses E2B sandbox. stream_visual=true for Chrome with live video, false for Camoufox stealth.",
)
@limiter.limit("10/minute")
async def browser_agent(request: Request, body: BrowseRequest):
    """
    Browser agent with two modes:
    - stream_visual=true: Chrome with live video stream (5 min timeout)
    - stream_visual=false: Camoufox stealth headless (faster, anti-bot)
    """

    async def event_generator():
        try:
            # Pick the runner for the requested mode, then stream its events.
            if body.stream_visual:
                from app.agents.browser_agent import run_browser_agent as runner
            else:
                from app.agents.browser_agent_v2 import run_browser_agent_v2 as runner

            async for event in runner(body.task, body.url):
                yield f"data: {json.dumps(event)}\n\n"
        except Exception as e:
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
|
| 579 |
+
|
app/api/schemas.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for API request/response models."""
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# === Request Models ===
|
| 10 |
+
|
| 11 |
+
class SearchRequest(BaseModel):
    """Search request payload."""

    query: str = Field(..., min_length=1, max_length=1000, description="Search query")
    max_results: int = Field(default=10, ge=1, le=50, description="Maximum results to return")
    # "any" disables recency filtering entirely.
    freshness: Literal["day", "week", "month", "year", "any"] = Field(
        default="any",
        description="Filter results by recency"
    )
    include_domains: list[str] | None = Field(
        default=None,
        description="Only include results from these domains"
    )
    exclude_domains: list[str] | None = Field(
        default=None,
        description="Exclude results from these domains"
    )
    # When False, endpoints skip LLM synthesis and return ranked results only.
    include_answer: bool = Field(
        default=True,
        description="Include AI-generated answer"
    )
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# === Response Models ===
|
| 35 |
+
|
| 36 |
+
class Citation(BaseModel):
    """Citation reference for the answer.

    Maps a numbered marker in the synthesized answer back to its source.
    """

    index: int = Field(..., description="Citation index (1-based)")
    url: str = Field(..., description="Source URL")
    title: str = Field(..., description="Source title")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TemporalContext(BaseModel):
    """Temporal metadata about the search."""

    # "current" = freshness matters; "historical" = older material preferred;
    # "neutral" = no detectable time preference.
    query_temporal_intent: Literal["current", "historical", "neutral"] = Field(
        ...,
        description="Detected temporal intent of the query"
    )
    temporal_urgency: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="How important freshness is for this query (0-1)"
    )
    # Populated server-side with datetime.now().strftime("%Y-%m-%d").
    current_date: str = Field(..., description="Current date for context")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class SearchResult(BaseModel):
    """Individual search result.

    All score fields are normalized to the [0, 1] range.
    """

    title: str = Field(..., description="Result title")
    url: str = Field(..., description="Result URL")
    content: str = Field(..., description="Result content/snippet")
    score: float = Field(..., ge=0.0, le=1.0, description="Overall relevance score")
    published_date: datetime | None = Field(
        default=None,
        description="Publication date if available"
    )
    freshness_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="How fresh/recent the content is"
    )
    authority_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Domain authority/trust score"
    )
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class SearchResponse(BaseModel):
    """Complete search response returned by the /search endpoints."""

    query: str = Field(..., description="Original query")
    # None when synthesis is disabled (include_answer=False) or skipped.
    answer: str | None = Field(
        default=None,
        description="AI-generated answer synthesized from results"
    )
    results: list[SearchResult] = Field(
        default_factory=list,
        description="Ranked search results"
    )
    citations: list[Citation] = Field(
        default_factory=list,
        description="Citations referenced in the answer"
    )
    temporal_context: TemporalContext | None = Field(
        default=None,
        description="Temporal analysis metadata"
    )
    processing_time_ms: float = Field(..., description="Total processing time in milliseconds")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class ErrorResponse(BaseModel):
    """Error response model (documented for 5xx responses in OpenAPI)."""

    error: str = Field(..., description="Error message")
    detail: str | None = Field(default=None, description="Detailed error information")
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# === Deep Research Models ===
|
| 116 |
+
|
| 117 |
+
class DeepResearchRequest(BaseModel):
    """Deep research request payload.

    The max_* fields bound the total work (and external API spend) a single
    research run may perform.
    """

    query: str = Field(..., min_length=1, max_length=2000, description="Research query")
    max_dimensions: int = Field(
        default=5,
        ge=2,
        le=8,
        description="Maximum research dimensions to explore"
    )
    max_sources_per_dim: int = Field(
        default=5,
        ge=1,
        le=10,
        description="Maximum sources per dimension"
    )
    max_total_searches: int = Field(
        default=20,
        ge=5,
        le=30,
        description="Maximum total API searches"
    )
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# === Browser Agent Models ===
|
| 142 |
+
|
| 143 |
+
class BrowseRequest(BaseModel):
    """Browser agent request payload."""

    task: str = Field(
        ...,
        min_length=1,
        max_length=2000,
        description="Task description (e.g., 'Get the top 5 headlines')"
    )
    # Optional starting point; the agent may navigate further on its own.
    url: str | None = Field(
        default=None,
        description="URL to navigate to"
    )
    # True selects the Chrome/live-video runner; False selects the stealth runner.
    stream_visual: bool = Field(
        default=False,
        description="Use Chrome with live video stream (less stealth, but visual)"
    )
|
app/config.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application configuration using pydantic-settings."""
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Values come from the process environment and an optional `.env` file;
    all fields have defaults, so the app starts even with no configuration.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",  # tolerate unrelated env vars instead of failing
    )

    # API Keys - Search Sources (empty string means "provider disabled")
    tavily_api_key: str = ""
    brave_api_key: str = ""  # 2000 free/month

    # SearXNG (self-hosted meta-search - uses your HF Space by default)
    searxng_url: str = "https://madras1-searxng-space.hf.space"
    serper_api_key: str | None = None

    # E2B Desktop (cloud browser for browser agent)
    e2b_api_key: str = ""

    # API Keys - LLM Providers
    groq_api_key: str | None = None
    openrouter_api_key: str | None = None

    # LLM Configuration
    llm_provider: Literal["groq", "openrouter"] = "openrouter"
    llm_model: str = "stepfun/step-3.5-flash:free"

    # Reranking Models (lightweight for HF Spaces)
    bi_encoder_model: str = "Madras1/minilm-gooaq-mnr-v5"  # Fine-tuned on GooAQ + NQ
    cross_encoder_model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"  # ~90MB

    # Temporal Settings
    default_freshness_half_life: int = 30  # days

    # API Settings
    max_search_results: int = 20
    max_final_results: int = 10

    # Deep Research Settings
    max_research_dimensions: int = 6
    max_tavily_calls_per_research: int = 20
    deep_research_model: str | None = None  # Use main model if None

    @property
    def llm_api_key(self) -> str:
        """Get the appropriate API key based on provider.

        Returns an empty string (never None) when the key is unset.
        """
        if self.llm_provider == "groq":
            return self.groq_api_key or ""
        return self.openrouter_api_key or ""
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@lru_cache
def get_settings() -> Settings:
    """Get cached settings instance.

    ``lru_cache`` ensures the environment/.env file is parsed only once per
    process; all callers share the same Settings object.
    """
    return Settings()
|
app/main.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lancer API - Main FastAPI application."""
|
| 2 |
+
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from slowapi import _rate_limit_exceeded_handler
|
| 9 |
+
from slowapi.errors import RateLimitExceeded
|
| 10 |
+
|
| 11 |
+
from app.api.routes import search
|
| 12 |
+
from app.config import get_settings
|
| 13 |
+
from app.middleware.rate_limiter import limiter
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: startup before ``yield``, shutdown after.

    Startup logs the active LLM configuration so misconfigured deployments
    are visible immediately in the container logs.
    """
    # Startup
    settings = get_settings()
    # Fix: strings without placeholders should not carry an f-prefix (F541).
    print("🚀 Lancer API starting...")
    print(f"   LLM Provider: {settings.llm_provider}")
    print(f"   LLM Model: {settings.llm_model}")
    print("   Rate limiting: enabled")
    yield
    # Shutdown
    print("👋 Lancer API shutting down...")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# FastAPI application instance; lifespan handles startup/shutdown logging.
app = FastAPI(
    title="Lancer Search API",
    description="Advanced AI-powered search API with temporal intelligence",
    version="0.1.0",
    lifespan=lifespan,
)

# Rate limiting: attach the shared limiter and translate RateLimitExceeded
# into 429 responses via slowapi's default handler.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers per the CORS spec — confirm whether credentialed
# requests are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers (all search endpoints live under /api/v1)
app.include_router(search.router, prefix="/api/v1", tags=["search"])
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@app.get("/health")
async def health_check():
    """Health check endpoint.

    Returns a liveness payload with a timezone-aware UTC timestamp so the
    value is unambiguous regardless of the server's locale/timezone
    (previously ``datetime.now()`` emitted a naive local time).
    """
    from datetime import timezone  # local import: module only imports datetime

    return {
        "status": "healthy",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "version": "0.1.0",
    }
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@app.get("/")
async def root():
    """Root endpoint: basic API metadata and useful paths."""
    info = {
        "name": "Lancer Search API",
        "version": "0.1.0",
        "docs": "/docs",
        "health": "/health",
    }
    return info
|
app/middleware/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Middleware package."""
|
app/middleware/rate_limiter.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rate limiting middleware using SlowAPI.
|
| 2 |
+
|
| 3 |
+
Provides IP-based rate limiting for all API endpoints.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from slowapi import Limiter
|
| 7 |
+
from slowapi.util import get_remote_address
|
| 8 |
+
from slowapi.errors import RateLimitExceeded
|
| 9 |
+
from slowapi.middleware import SlowAPIMiddleware
|
| 10 |
+
from fastapi import Request
|
| 11 |
+
from fastapi.responses import JSONResponse
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Create limiter instance with IP-based key (one bucket per client address).
# Applied as the app-wide default; per-endpoint limits live in LIMITS below.
limiter = Limiter(
    key_func=get_remote_address,
    default_limits=["100/minute"],
    storage_uri="memory://",  # Use memory storage (OK for single instance on HF Spaces)
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
    """Render a RateLimitExceeded error as a structured 429 JSON response."""
    payload = {
        "error": "rate_limit_exceeded",
        "message": f"Rate limit exceeded: {exc.detail}",
        "retry_after": getattr(exc, "retry_after", 60),
    }
    return JSONResponse(status_code=429, content=payload)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Per-endpoint rate limits, keyed by endpoint family.
# Values use slowapi's "<count>/<period>" limit-string syntax.
LIMITS = {
    "search": "30/minute",
    "heavy": "10/minute",
    "deep": "5/minute",
    "images": "60/minute",
}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_limiter():
    """Get the limiter instance for dependency injection.

    Returns the module-level ``limiter`` so all FastAPI dependencies share
    a single rate-limit store.
    """
    return limiter
|
app/reranking/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Reranking module."""
|
app/reranking/authority_scorer.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Domain authority scoring.
|
| 2 |
+
|
| 3 |
+
Assigns trust/authority scores to domains based on known reliable sources.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# High authority domains (trusted sources).
# Keys starting with "." are TLD/suffix rules (e.g. ".edu").
# Other keys match the domain exactly or as a true subdomain — e.g.
# "gist.github.com" matches "github.com", but "notgithub.com" does not.
HIGH_AUTHORITY_DOMAINS = {
    # Academic & Research
    ".edu": 0.9,
    ".gov": 0.9,
    ".ac.uk": 0.85,

    # Major tech companies
    "github.com": 0.8,
    "stackoverflow.com": 0.8,
    "docs.python.org": 0.85,
    "developer.mozilla.org": 0.85,
    "arxiv.org": 0.9,

    # Major news sources
    "reuters.com": 0.8,
    "bbc.com": 0.75,
    "nytimes.com": 0.75,
    "theguardian.com": 0.75,

    # Reference
    "wikipedia.org": 0.7,
    "britannica.com": 0.8,

    # AI/ML specific
    "openai.com": 0.85,
    "anthropic.com": 0.85,
    "huggingface.co": 0.8,
    "deepmind.google": 0.85,
    "ai.meta.com": 0.8,

    # Tech publications
    "techcrunch.com": 0.7,
    "wired.com": 0.7,
    "arstechnica.com": 0.75,
    "theverge.com": 0.7,
}

# Low authority patterns (less reliable, largely user-generated content)
LOW_AUTHORITY_PATTERNS = [
    "medium.com",  # User-generated, variable quality
    "reddit.com",  # Forum, variable quality
    "quora.com",  # Q&A, variable quality
    "blogspot.com",
    "wordpress.com",
    "tumblr.com",
]


def calculate_authority_score(url: str) -> float:
    """
    Calculate domain authority score for a URL.

    Args:
        url: The URL to score

    Returns:
        Authority score between 0.0 and 1.0; 0.5 for unknown or
        unparseable URLs.
    """
    if not url:
        return 0.5

    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()

        # Remove www. prefix
        if domain.startswith("www."):
            domain = domain[4:]

        # BUGFIX: the old check used `domain.endswith(known_domain)`, which
        # matched unrelated hosts (e.g. "notbbc.com" scored as "bbc.com").
        # Require either an exact match or a dot boundary before the suffix.
        for known_domain, score in HIGH_AUTHORITY_DOMAINS.items():
            if known_domain.startswith("."):
                # TLD/suffix rule (.edu, .gov, .ac.uk)
                if domain.endswith(known_domain):
                    return score
            elif domain == known_domain or domain.endswith("." + known_domain):
                return score

        # Check for low authority patterns
        for pattern in LOW_AUTHORITY_PATTERNS:
            if pattern in domain:
                return 0.4

        # Default score for unknown domains
        return 0.5

    except Exception:
        return 0.5
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def get_domain_category(url: str) -> str:
    """
    Get a category label for the domain.

    Args:
        url: The URL to categorize

    Returns:
        Category string like "Academic", "News", "Developer", etc.
        "Unknown" for empty/unparseable URLs, "General" otherwise.
    """
    if not url:
        return "Unknown"

    # Ordered substring rules; the first category with a matching
    # needle wins (mirrors the original if/elif chain).
    rules = [
        ("Academic", (".edu", ".ac.uk", "arxiv")),
        ("Government", (".gov",)),
        ("Developer", ("github", "stackoverflow", "docs.")),
        ("News", ("reuters", "bbc", "nytimes", "cnn", "guardian")),
        ("AI/ML", ("openai", "anthropic", "huggingface", "deepmind")),
        ("Reference", ("wikipedia",)),
    ]

    try:
        host = urlparse(url).netloc.lower()
        for category, needles in rules:
            if any(needle in host for needle in needles):
                return category
        return "General"
    except Exception:
        return "Unknown"
|
app/reranking/embeddings.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Embedding-based reranking using sentence-transformers.
|
| 2 |
+
|
| 3 |
+
Provides bi-encoder and cross-encoder reranking for better relevance scoring.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
from app.config import get_settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@lru_cache(maxsize=1)
def get_bi_encoder():
    """Load and cache the bi-encoder model.

    The import is deferred so sentence-transformers (and torch) only load
    on first use; lru_cache keeps a single model instance per process.
    """
    from sentence_transformers import SentenceTransformer
    settings = get_settings()
    return SentenceTransformer(settings.bi_encoder_model)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@lru_cache(maxsize=1)
def get_cross_encoder():
    """Load and cache the cross-encoder model.

    Deferred import + lru_cache, same rationale as get_bi_encoder: load
    the heavy model lazily and exactly once per process.
    """
    from sentence_transformers import CrossEncoder
    settings = get_settings()
    return CrossEncoder(settings.cross_encoder_model)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def compute_bi_encoder_scores(
    query: str,
    documents: list[str],
) -> list[float]:
    """
    Compute semantic similarity scores using the bi-encoder.

    Fast but less accurate than the cross-encoder; suited to initial
    filtering of large result sets.

    Args:
        query: Search query
        documents: List of document texts

    Returns:
        List of similarity scores in [0, 1]
    """
    if not documents:
        return []

    encoder = get_bi_encoder()

    # With normalized embeddings, the dot product equals cosine similarity.
    query_vec = encoder.encode(query, normalize_embeddings=True)
    doc_vecs = encoder.encode(documents, normalize_embeddings=True)
    cosines = np.dot(doc_vecs, query_vec)

    # Map the cosine range [-1, 1] onto [0, 1].
    return [(float(c) + 1) / 2 for c in cosines]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def compute_cross_encoder_scores(
    query: str,
    documents: list[str],
) -> list[float]:
    """
    Compute relevance scores using the cross-encoder.

    More accurate than the bi-encoder but slower; use after initial
    filtering for precise ranking.

    Args:
        query: Search query
        documents: List of document texts

    Returns:
        List of relevance scores, min-max normalized to [0, 1]
        (0.5 for every entry when all raw scores are equal)
    """
    if not documents:
        return []

    encoder = get_cross_encoder()

    # Score each (query, document) pair jointly.
    raw_scores = encoder.predict([[query, doc] for doc in documents])

    lo = float(np.min(raw_scores))
    hi = float(np.max(raw_scores))

    # Degenerate case: all scores identical -> no ranking signal.
    if hi <= lo:
        return [0.5] * len(raw_scores)

    span = hi - lo
    return [(float(s) - lo) / span for s in raw_scores]
|
app/reranking/pipeline.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Multi-stage reranking pipeline.
|
| 2 |
+
|
| 3 |
+
Implements a 3-stage reranking approach:
|
| 4 |
+
1. Bi-Encoder: Fast semantic similarity (for large result sets)
|
| 5 |
+
2. Cross-Encoder: Accurate relevance scoring
|
| 6 |
+
3. Temporal + Authority: Freshness and domain trust weighting
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
from app.temporal.freshness_scorer import calculate_freshness_score, adjust_score_by_freshness
|
| 13 |
+
from app.reranking.authority_scorer import calculate_authority_score
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
# Flag to enable/disable embedding-based reranking. Set False to skip the
# model-based stages entirely (e.g. on memory-constrained deployments);
# temporal + authority scoring still runs.
ENABLE_EMBEDDING_RERANKING = True
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
async def rerank_results(
    query: str,
    results: list[dict],
    temporal_urgency: float = 0.5,
    max_results: int = 10,
    use_embeddings: bool = True,
) -> list[dict]:
    """
    Apply multi-stage reranking to search results.

    Pipeline:
        1. Bi-encoder: Quick semantic filtering (if results > 20)
        2. Cross-encoder: Precise relevance scoring (top candidates)
        3. Temporal + Authority: Freshness and trust weighting

    Args:
        query: Original search query
        results: Raw search results (not mutated)
        temporal_urgency: How important freshness is (0-1)
        max_results: Maximum results to return
        use_embeddings: Whether to use embedding models

    Returns:
        Reranked results with updated scores
    """
    if not results:
        return []

    # BUGFIX: list.copy() was only a shallow copy, so the score mutations
    # below leaked into the caller's result dicts. Copy each dict as well.
    scored_results = [dict(r) for r in results]

    # Stage 1 & 2: Embedding-based reranking
    if use_embeddings and ENABLE_EMBEDDING_RERANKING:
        try:
            scored_results = await _apply_embedding_reranking(query, scored_results)
            logger.info(f"Applied embedding reranking to {len(scored_results)} results")
        except Exception as e:
            logger.warning(f"Embedding reranking failed, using fallback: {e}")
            # Fall through to basic scoring

    # Stage 3: Apply temporal + authority scoring
    for result in scored_results:
        # Calculate freshness score
        freshness = calculate_freshness_score(result.get("published_date"))
        result["freshness_score"] = freshness

        # Calculate authority score
        authority = calculate_authority_score(result.get("url", ""))
        result["authority_score"] = authority

        # Get base score (from search source or embedding)
        base_score = result.get("score", 0.5)

        # Adjust for freshness based on temporal urgency
        adjusted_score = adjust_score_by_freshness(
            base_score=base_score,
            freshness_score=freshness,
            temporal_urgency=temporal_urgency,
        )

        # Also factor in authority (10% weight)
        result["score"] = (adjusted_score * 0.9) + (authority * 0.1)

    # Sort by final score (descending)
    scored_results.sort(key=lambda x: x["score"], reverse=True)

    return scored_results[:max_results]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
async def _apply_embedding_reranking(
    query: str,
    results: list[dict],
) -> list[dict]:
    """Apply bi-encoder and cross-encoder reranking.

    Stage 1 (only for large candidate sets) pre-filters with the fast
    bi-encoder; stage 2 rescores the survivors with the cross-encoder.
    """
    from app.reranking.embeddings import compute_bi_encoder_scores, compute_cross_encoder_scores

    def _to_document(r: dict) -> str:
        # Title + truncated content is the text the models actually score.
        return f"{r.get('title', '')}. {r.get('content', '')[:500]}"

    documents = [_to_document(r) for r in results]

    # Stage 1: Bi-encoder for initial scoring (fast)
    if len(results) > 15:
        bi_scores = compute_bi_encoder_scores(query, documents)
        for result, score in zip(results, bi_scores):
            result["bi_encoder_score"] = score

        # Keep the 15 most promising candidates for the cross-encoder.
        results = sorted(
            results, key=lambda x: x.get("bi_encoder_score", 0), reverse=True
        )[:15]
        # BUGFIX: rebuild the document list from the *sorted* results.
        # Previously `documents[:15]` kept the original order, so the
        # cross-encoder scored the wrong document for each result.
        documents = [_to_document(r) for r in results]

    # Stage 2: Cross-encoder for precise scoring (slower but accurate)
    cross_scores = compute_cross_encoder_scores(query, documents)

    for result, cross_score in zip(results, cross_scores):
        # Blend cross-encoder score with original source score:
        # cross-encoder gets 70% weight, original 30%.
        original_score = result.get("score", 0.5)
        result["score"] = (cross_score * 0.7) + (original_score * 0.3)
        result["cross_encoder_score"] = cross_score

    return results
|
| 127 |
+
|
app/sources/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Search sources module."""
|
app/sources/aggregator.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Multi-source search aggregator.
|
| 2 |
+
|
| 3 |
+
Combines results from multiple search sources in parallel.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from urllib.parse import urlparse
|
| 9 |
+
|
| 10 |
+
from app.config import get_settings
|
| 11 |
+
from app.sources.tavily import search_tavily
|
| 12 |
+
from app.sources.brave import search_brave
|
| 13 |
+
from app.sources.duckduckgo import search_duckduckgo
|
| 14 |
+
from app.sources.wikipedia import search_wikipedia
|
| 15 |
+
from app.sources.searxng import search_searxng
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
async def aggregate_search(
    query: str,
    max_results: int = 15,
    freshness: str = "any",
    include_wikipedia: bool = True,
    include_domains: Optional[list[str]] = None,
    exclude_domains: Optional[list[str]] = None,
) -> list[dict]:
    """
    Aggregate search results from multiple sources in parallel.

    Args:
        query: Search query
        max_results: Maximum total results to return
        freshness: Freshness filter (day, week, month, year, any)
        include_wikipedia: Whether to include Wikipedia results
        include_domains: Only include these domains (Tavily only)
        exclude_domains: Exclude these domains (Tavily only)

    Returns:
        Deduplicated, merged list of search results, sorted by score desc
    """
    settings = get_settings()

    # Build list of search tasks.
    # NOTE: source_names must stay index-aligned with tasks; it is used
    # below to attribute failures returned by asyncio.gather.
    tasks = []
    source_names = []

    # SearXNG (if configured - free, high volume)
    if hasattr(settings, 'searxng_url') and settings.searxng_url:
        # "year"/"any" have no SearXNG equivalent and map to None (no filter)
        time_range = {"day": "day", "week": "week", "month": "month"}.get(freshness)
        tasks.append(search_searxng(
            query=query,
            max_results=15,
            time_range=time_range,
        ))
        source_names.append("searxng")

    # Tavily (primary source - if API key available)
    if settings.tavily_api_key:
        tasks.append(search_tavily(
            query=query,
            max_results=12,  # Primary source
            freshness=freshness,
            include_domains=include_domains,
            exclude_domains=exclude_domains,
        ))
        source_names.append("tavily")

    # Brave (secondary - limited quota, use sparingly)
    if settings.brave_api_key:
        tasks.append(search_brave(
            query=query,
            max_results=5,  # Reduced to save quota
            freshness=freshness,
        ))
        source_names.append("brave")

    # DuckDuckGo (always available, free)
    tasks.append(search_duckduckgo(
        query=query,
        max_results=12,  # Free, can use more
    ))
    source_names.append("duckduckgo")

    # Wikipedia (for context/background)
    if include_wikipedia:
        tasks.append(search_wikipedia(
            query=query,
            max_results=5,
        ))
        source_names.append("wikipedia")

    # Run all searches in parallel; return_exceptions=True so one failing
    # source does not cancel the others.
    results_lists = await asyncio.gather(*tasks, return_exceptions=True)

    # Merge results, skipping sources that raised
    all_results = []
    for i, results in enumerate(results_lists):
        if isinstance(results, Exception):
            print(f"Source {source_names[i]} failed: {results}")
            continue
        if results:
            all_results.extend(results)

    # Deduplicate by URL (scheme, "www." and query string are ignored by
    # _normalize_url). First occurrence wins, so source order above matters.
    seen_urls = set()
    unique_results = []

    for result in all_results:
        url = result.get("url", "")
        normalized_url = _normalize_url(url)

        if normalized_url not in seen_urls:
            seen_urls.add(normalized_url)
            unique_results.append(result)

    # Sort by score (descending)
    unique_results.sort(key=lambda x: x.get("score", 0), reverse=True)

    return unique_results[:max_results]
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _normalize_url(url: str) -> str:
|
| 122 |
+
"""Normalize URL for deduplication."""
|
| 123 |
+
try:
|
| 124 |
+
parsed = urlparse(url)
|
| 125 |
+
# Remove www., trailing slashes, and query params for comparison
|
| 126 |
+
host = parsed.netloc.replace("www.", "")
|
| 127 |
+
path = parsed.path.rstrip("/")
|
| 128 |
+
return f"{host}{path}".lower()
|
| 129 |
+
except:
|
| 130 |
+
return url.lower()
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
async def get_available_sources() -> list[str]:
    """Get list of available search sources based on configuration."""
    settings = get_settings()

    # DuckDuckGo and Wikipedia need no credentials — always on.
    sources = ["duckduckgo", "wikipedia"]

    # Keyed sources are enabled only when their config value is truthy.
    conditional = [
        ("searxng", bool(getattr(settings, "searxng_url", None))),
        ("tavily", bool(settings.tavily_api_key)),
        ("brave", bool(settings.brave_api_key)),
    ]
    sources.extend(name for name, enabled in conditional if enabled)

    return sources
|
app/sources/brave.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Brave Search API source.
|
| 2 |
+
|
| 3 |
+
Official Brave Search API with 2000 free queries/month.
|
| 4 |
+
https://api.search.brave.com/
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import httpx
|
| 11 |
+
|
| 12 |
+
from app.config import get_settings
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
async def search_brave(
    query: str,
    max_results: int = 10,
    freshness: str = "any",
    country: str = "BR",
) -> list[dict]:
    """
    Search using Brave Search API.

    Args:
        query: Search query
        max_results: Maximum results (capped at 20 by the API)
        freshness: One of "day", "week", "month", "year", "any"
            (mapped below to Brave's pd/pw/pm/py codes)
        country: Country code for results (defaults to "BR")

    Returns:
        List of search results with title, url, content, published_date,
        score, source. Empty list when no API key is configured or the
        request fails.
    """
    settings = get_settings()

    # No key -> source disabled; fail soft with no results.
    if not settings.brave_api_key:
        return []

    # Map freshness to Brave format (pd = past day, pw = past week, ...)
    freshness_map = {
        "day": "pd",
        "week": "pw",
        "month": "pm",
        "year": "py",
        "any": None,
    }
    brave_freshness = freshness_map.get(freshness)

    params = {
        "q": query,
        "count": min(max_results, 20),
        "country": country,
        # NOTE(review): search language is hardcoded to Portuguese —
        # confirm this is intended for all queries.
        "search_lang": "pt",
        "text_decorations": False,
    }

    if brave_freshness:
        params["freshness"] = brave_freshness

    headers = {
        "Accept": "application/json",
        "X-Subscription-Token": settings.brave_api_key,
    }

    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(
                "https://api.search.brave.com/res/v1/web/search",
                params=params,
                headers=headers,
            )
            response.raise_for_status()
            data = response.json()

        results = []
        web_results = data.get("web", {}).get("results", [])

        for i, item in enumerate(web_results):
            # Try to parse age/date (Brave returns relative strings
            # like "2 days ago")
            published_date = None
            age = item.get("age")
            if age:
                published_date = _parse_brave_age(age)

            results.append({
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "content": item.get("description", ""),
                "published_date": published_date,
                "score": 0.8 - (i * 0.05),  # Decay score by position
                "source": "brave",
            })

        return results

    except httpx.HTTPStatusError as e:
        print(f"Brave API error: {e.response.status_code}")
        return []
    except Exception as e:
        print(f"Brave search error: {e}")
        return []
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _parse_brave_age(age: str) -> Optional[datetime]:
|
| 104 |
+
"""Parse Brave's age string like '2 days ago' to datetime."""
|
| 105 |
+
import re
|
| 106 |
+
|
| 107 |
+
now = datetime.now()
|
| 108 |
+
|
| 109 |
+
patterns = [
|
| 110 |
+
(r"(\d+)\s*hour", lambda m: now.replace(hour=now.hour - int(m.group(1)))),
|
| 111 |
+
(r"(\d+)\s*day", lambda m: now.replace(day=now.day - int(m.group(1)))),
|
| 112 |
+
(r"(\d+)\s*week", lambda m: now.replace(day=now.day - int(m.group(1)) * 7)),
|
| 113 |
+
(r"(\d+)\s*month", lambda m: now.replace(month=now.month - int(m.group(1)))),
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
for pattern, func in patterns:
|
| 117 |
+
match = re.search(pattern, age, re.IGNORECASE)
|
| 118 |
+
if match:
|
| 119 |
+
try:
|
| 120 |
+
return func(match)
|
| 121 |
+
except ValueError:
|
| 122 |
+
pass
|
| 123 |
+
|
| 124 |
+
return None
|
app/sources/duckduckgo.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DuckDuckGo search source (free fallback).
|
| 2 |
+
|
| 3 |
+
Uses the duckduckgo_search library for free web search.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import httpx
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
async def search_duckduckgo(
    query: str,
    max_results: int = 10,
    region: str = "wt-wt",  # Worldwide
) -> list[dict]:
    """
    Free-fallback web search via DuckDuckGo Lite (no API key required).

    Fetches the lightweight HTML endpoint (no JavaScript needed) and
    scrapes it with parse_ddg_lite_results(); any failure degrades to
    an empty result list.

    Args:
        query: Search query.
        max_results: Maximum results to return.
        region: DuckDuckGo region code (default: worldwide).

    Returns:
        List of result dicts with title, url, content.
    """
    request_params = {
        "q": query,
        "kl": region,
        "kp": "-1",  # Safe search off
    }
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            # The "lite" endpoint serves plain HTML that is simple to parse.
            response = await client.get(
                "https://lite.duckduckgo.com/lite/",
                params=request_params,
                headers=browser_headers,
                follow_redirects=True,
            )
            response.raise_for_status()
            return parse_ddg_lite_results(response.text, max_results)
    except Exception as e:
        print(f"DuckDuckGo search error: {e}")
        return []
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def parse_ddg_lite_results(html: str, max_results: int) -> list[dict]:
    """
    Parse DuckDuckGo Lite HTML results.

    Extracts result links (class="result-link") paired positionally with
    snippets (class="result-snippet"), skipping DuckDuckGo-internal links.

    Args:
        html: Raw HTML from the DDG Lite endpoint.
        max_results: Maximum number of results to return.

    Returns:
        List of result dicts with title, url, content, published_date,
        score and source keys.
    """
    import html as html_mod  # aliased: the `html` parameter shadows the module
    import re

    results = []

    # Pattern: <a rel="nofollow" href="URL" class='result-link'>TITLE</a>
    link_pattern = r'<a[^>]*class=["\']result-link["\'][^>]*href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>'

    # Snippets live in <td class="result-snippet"> cells
    snippet_pattern = r'<td[^>]*class=["\']result-snippet["\'][^>]*>([^<]+)</td>'

    links = re.findall(link_pattern, html, re.IGNORECASE)
    snippets = re.findall(snippet_pattern, html, re.IGNORECASE)

    for i, (url, title) in enumerate(links[:max_results]):
        content = snippets[i] if i < len(snippets) else ""

        # Actually decode HTML entities (&amp;, &quot;, ...) — previously the
        # "clean up" step only stripped surrounding whitespace.
        title = html_mod.unescape(title).strip()
        content = html_mod.unescape(content).strip()

        # Skip DuckDuckGo internal links
        if "duckduckgo.com" in url:
            continue

        results.append({
            "title": title,
            "url": url,
            "content": content,
            "published_date": None,  # DDG Lite doesn't provide dates
            "score": 0.5,  # Neutral score, will be reranked
            "source": "duckduckgo",
        })

    return results[:max_results]
|
app/sources/images.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Image Search source.
|
| 2 |
+
|
| 3 |
+
Uses Tavily API with include_images=True for image search.
|
| 4 |
+
Falls back to Brave Image Search if Tavily unavailable.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import httpx
|
| 10 |
+
|
| 11 |
+
from app.config import get_settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
async def search_images(
    query: str,
    max_results: int = 6,
) -> list[dict]:
    """
    Search for images using whichever image API is configured.

    Priority: Tavily first (reuses the main search API key), then Brave
    as a fallback; returns an empty list when neither is configured.

    Args:
        query: Search query.
        max_results: Maximum images to return.

    Returns:
        List of image results with url, thumbnail, title.
    """
    cfg = get_settings()

    # Tavily first — same API key as main search.
    tavily_hits: list[dict] = []
    if cfg.tavily_api_key:
        tavily_hits = await _search_tavily_images(query, max_results)
    if tavily_hits:
        return tavily_hits

    # Brave as the fallback provider.
    if not cfg.brave_api_key:
        return []
    return await _search_brave_images(query, max_results)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
async def _search_tavily_images(query: str, max_results: int) -> list[dict]:
    """Search images via the Tavily API; returns [] on any failure."""
    settings = get_settings()

    request_body = {
        "api_key": settings.tavily_api_key,
        "query": query,
        "search_depth": "basic",
        "max_results": 5,  # We just need images, not full results
        "include_images": True,
        "include_image_descriptions": True,
        "include_answer": False,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(
                "https://api.tavily.com/search",
                json=request_body,
            )
            response.raise_for_status()
            payload = response.json()

        normalized: list[dict] = []
        for entry in payload.get("images", [])[:max_results]:
            if isinstance(entry, str):
                # Bare URL form
                normalized.append({"url": entry, "thumbnail": entry, "title": ""})
            elif isinstance(entry, dict):
                # Structured form carrying a description
                link = entry.get("url", "")
                normalized.append({
                    "url": link,
                    "thumbnail": link,
                    "title": entry.get("description", ""),
                })

        return normalized

    except Exception as e:
        print(f"Tavily image search error: {e}")
        return []
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
async def _search_brave_images(query: str, max_results: int) -> list[dict]:
    """Search images via the Brave Image Search API; returns [] on failure."""
    settings = get_settings()

    query_params = {
        "q": query,
        "count": min(max_results, 20),
        "safesearch": "moderate",
    }
    auth_headers = {
        "Accept": "application/json",
        "X-Subscription-Token": settings.brave_api_key,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://api.search.brave.com/res/v1/images/search",
                params=query_params,
                headers=auth_headers,
            )
            response.raise_for_status()
            data = response.json()

        # Normalize Brave's nested schema into {url, thumbnail, title}.
        return [
            {
                "url": item.get("properties", {}).get("url", ""),
                "thumbnail": item.get("thumbnail", {}).get("src", ""),
                "title": item.get("title", ""),
            }
            for item in data.get("results", [])[:max_results]
        ]

    except Exception as e:
        print(f"Brave image search error: {e}")
        return []
|
app/sources/scraper.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Content Scraper.
|
| 2 |
+
|
| 3 |
+
Extracts clean text content from URLs for deeper analysis.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
from typing import Optional
|
| 8 |
+
|
| 9 |
+
import httpx
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
async def scrape_url_content(
    url: str,
    max_chars: int = 5000,
    timeout: float = 10.0,
) -> Optional[str]:
    """
    Download a page and extract its readable text.

    Prefers trafilatura when installed (best extraction quality) and
    falls back to a crude regex-based tag stripper otherwise.

    Args:
        url: URL to scrape.
        max_chars: Maximum characters to return.
        timeout: Request timeout in seconds.

    Returns:
        Extracted text (truncated to max_chars) or None on failure.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }

    try:
        async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
            response = await client.get(url, headers=browser_headers)
            response.raise_for_status()
            page_html = response.text

        # Preferred path: trafilatura, when the optional dependency exists.
        try:
            import trafilatura
        except ImportError:
            trafilatura = None

        if trafilatura is not None:
            extracted = trafilatura.extract(
                page_html,
                include_comments=False,
                include_tables=True,
                no_fallback=False,
            )
            if extracted:
                return extracted[:max_chars]

        # Last resort: naive tag stripping without external libs.
        fallback_text = _simple_extract(page_html)
        return fallback_text[:max_chars] if fallback_text else None

    except Exception as e:
        print(f"Scrape error for {url}: {e}")
        return None
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _simple_extract(html: str) -> str:
|
| 64 |
+
"""Simple HTML text extraction without external libs."""
|
| 65 |
+
import re
|
| 66 |
+
|
| 67 |
+
# Remove script and style tags
|
| 68 |
+
html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
| 69 |
+
html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
| 70 |
+
html = re.sub(r'<head[^>]*>.*?</head>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
| 71 |
+
html = re.sub(r'<nav[^>]*>.*?</nav>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
| 72 |
+
html = re.sub(r'<footer[^>]*>.*?</footer>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
| 73 |
+
|
| 74 |
+
# Remove all HTML tags
|
| 75 |
+
text = re.sub(r'<[^>]+>', ' ', html)
|
| 76 |
+
|
| 77 |
+
# Clean up whitespace
|
| 78 |
+
text = re.sub(r'\s+', ' ', text)
|
| 79 |
+
text = text.strip()
|
| 80 |
+
|
| 81 |
+
return text
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
async def scrape_multiple_urls(
    urls: list[str],
    max_chars_per_url: int = 3000,
    max_concurrent: int = 5,
) -> dict[str, Optional[str]]:
    """
    Scrape several URLs concurrently, bounded by a semaphore.

    Args:
        urls: List of URLs to scrape.
        max_chars_per_url: Max chars kept per URL.
        max_concurrent: Max in-flight requests at any time.

    Returns:
        Dict mapping each URL to its extracted content (None on failure).
    """
    gate = asyncio.Semaphore(max_concurrent)

    async def _bounded_fetch(target: str) -> tuple[str, Optional[str]]:
        # The semaphore keeps us from hammering remote hosts.
        async with gate:
            return target, await scrape_url_content(target, max_chars_per_url)

    pairs = await asyncio.gather(*(_bounded_fetch(u) for u in urls))
    return dict(pairs)
|
app/sources/searxng.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SearXNG meta-search source.
|
| 2 |
+
|
| 3 |
+
Uses a self-hosted SearXNG instance for comprehensive search results
|
| 4 |
+
from multiple engines (Google, Bing, DDG, etc.) without API costs.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
import httpx
|
| 11 |
+
|
| 12 |
+
from app.config import get_settings
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Default SearXNG instance (your HF Space).
# search_searxng() falls back to this when no URL is passed or configured.
DEFAULT_SEARXNG_URL = "https://madras1-searxng-space.hf.space"

# No fallbacks - use only your instance.
# Extra instance URLs, tried in order after the default; intentionally empty.
FALLBACK_INSTANCES = []
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
async def search_searxng(
    query: str,
    max_results: int = 50,
    categories: Optional[list[str]] = None,
    engines: Optional[list[str]] = None,
    language: str = "all",
    time_range: Optional[str] = None,
    searxng_url: Optional[str] = None,
) -> list[dict]:
    """
    Search using the SearXNG meta-search engine.

    Tries each known instance in priority order (explicit argument,
    configured URL, default, fallbacks) and returns results from the
    first one that responds with data. Returns many more results than
    API-based sources, making embedding-based reranking valuable.

    Args:
        query: Search query.
        max_results: Maximum results to return (can be 50-100+).
        categories: Search categories (general, news, science, etc.).
        engines: Specific engines to use (google, bing, etc.).
        language: Language code (en, pt, all).
        time_range: Time filter (day, week, month, year).
        searxng_url: Custom SearXNG instance URL.

    Returns:
        List of search results with title, url, content, source.
    """
    settings = get_settings()

    # Instance priority: explicit arg > configured > default > fallbacks.
    candidates: list[str] = []
    if searxng_url:
        candidates.append(searxng_url)
    configured = getattr(settings, "searxng_url", None)
    if configured:
        candidates.append(configured)
    candidates.append(DEFAULT_SEARXNG_URL)
    candidates.extend(FALLBACK_INSTANCES)

    query_params: dict = {
        "q": query,
        "format": "json",
        "language": language,
    }
    if categories:
        query_params["categories"] = ",".join(categories)
    if engines:
        query_params["engines"] = ",".join(engines)
    if time_range:
        query_params["time_range"] = time_range

    # First instance that yields non-empty results wins.
    for candidate in candidates:
        try:
            hits = await _fetch_searxng(candidate, query_params, max_results)
        except Exception as e:
            print(f"SearXNG instance {candidate} failed: {e}")
            continue
        if hits:
            return hits

    return []
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
async def _fetch_searxng(
    instance_url: str,
    params: dict,
    max_results: int,
) -> list[dict]:
    """Query one SearXNG instance and normalize its JSON results."""

    # Browser-like headers: some public instances block unknown clients.
    request_headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

    async with httpx.AsyncClient(timeout=15.0) as client:
        response = await client.get(
            f"{instance_url.rstrip('/')}/search",
            params=params,
            headers=request_headers,
        )
        response.raise_for_status()
        payload = response.json()

    normalized = []
    for raw in payload.get("results", [])[:max_results]:
        entry = {
            "title": raw.get("title", ""),
            "url": raw.get("url", ""),
            "content": raw.get("content", ""),
            "source": f"searxng:{raw.get('engine', 'unknown')}",
            "score": _calculate_score(raw),
        }

        # Only attach a date when the engine actually provided one.
        when = raw.get("publishedDate")
        if when:
            entry["published_date"] = when

        normalized.append(entry)

    return normalized
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _calculate_score(item: dict) -> float:
|
| 131 |
+
"""Calculate initial score based on position and engine."""
|
| 132 |
+
# Base score from position (if available)
|
| 133 |
+
position = item.get("position", 10)
|
| 134 |
+
position_score = max(0.3, 1.0 - (position * 0.05))
|
| 135 |
+
|
| 136 |
+
# Bonus for certain engines
|
| 137 |
+
engine = item.get("engine", "").lower()
|
| 138 |
+
engine_bonus = {
|
| 139 |
+
"google": 0.1,
|
| 140 |
+
"bing": 0.05,
|
| 141 |
+
"duckduckgo": 0.05,
|
| 142 |
+
"wikipedia": 0.1,
|
| 143 |
+
"arxiv": 0.15,
|
| 144 |
+
"google scholar": 0.15,
|
| 145 |
+
}.get(engine, 0)
|
| 146 |
+
|
| 147 |
+
return min(1.0, position_score + engine_bonus)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
async def get_searxng_engines(searxng_url: Optional[str] = None) -> list[str]:
    """Return the names of all enabled engines on a SearXNG instance."""
    target = searxng_url or DEFAULT_SEARXNG_URL

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(f"{target}/config")
            response.raise_for_status()
            config = response.json()
        # Keep only engines that are not marked disabled.
        return [
            eng["name"]
            for eng in config.get("engines", [])
            if not eng.get("disabled", False)
        ]
    except Exception:
        return []
|
app/sources/tavily.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tavily search source integration.
|
| 2 |
+
|
| 3 |
+
Tavily provides high-quality, AI-optimized search results.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from typing import Literal, Optional
|
| 8 |
+
|
| 9 |
+
import httpx
|
| 10 |
+
|
| 11 |
+
from app.config import get_settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
async def search_tavily(
    query: str,
    max_results: int = 10,
    freshness: Literal["day", "week", "month", "year", "any"] = "any",
    include_domains: Optional[list[str]] = None,
    exclude_domains: Optional[list[str]] = None,
    search_depth: Literal["basic", "advanced"] = "advanced",
) -> list[dict]:
    """
    Search using the Tavily API.

    Args:
        query: Search query.
        max_results: Maximum results to return.
        freshness: Recency filter, mapped onto Tavily's `days` parameter.
        include_domains: Only include these domains.
        exclude_domains: Exclude these domains.
        search_depth: "basic" (fast) or "advanced" (thorough).

    Returns:
        List of result dicts with title, url, content, published_date,
        score. Empty when no API key is configured or the request fails.
    """
    settings = get_settings()
    if not settings.tavily_api_key:
        return []

    # Freshness keyword -> Tavily "days" window ("any" means no filter).
    freshness_days = {"day": 1, "week": 7, "month": 30, "year": 365, "any": None}

    body: dict = {
        "api_key": settings.tavily_api_key,
        "query": query,
        "search_depth": search_depth,
        "max_results": max_results,
        "include_answer": False,
        "include_raw_content": False,
    }
    if freshness_days.get(freshness):
        body["days"] = freshness_days[freshness]
    if include_domains:
        body["include_domains"] = include_domains
    if exclude_domains:
        body["exclude_domains"] = exclude_domains

    def _parse_date(raw) -> Optional[datetime]:
        # Tavily dates are ISO-8601, sometimes with a trailing "Z".
        if not raw:
            return None
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            return None

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                "https://api.tavily.com/search",
                json=body,
            )
            response.raise_for_status()
            data = response.json()

        return [
            {
                "title": item.get("title", ""),
                "url": item.get("url", ""),
                "content": item.get("content", ""),
                "published_date": _parse_date(item.get("published_date")),
                "score": item.get("score", 0.5),
                "source": "tavily",
            }
            for item in data.get("results", [])
        ]

    except httpx.HTTPError as e:
        print(f"Tavily search error: {e}")
        return []
    except Exception as e:
        print(f"Tavily unexpected error: {e}")
        return []
|
app/sources/wikipedia.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Wikipedia Search source.
|
| 2 |
+
|
| 3 |
+
Uses Wikipedia's free API for background/context information.
|
| 4 |
+
No API key required, unlimited usage.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import httpx
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
async def search_wikipedia(
    query: str,
    max_results: int = 5,
    language: str = "pt",
) -> list[dict]:
    """
    Search Wikipedia for relevant articles.

    Performs one search request, then one extra extract request per hit
    (sequentially, on the same client) to enrich the snippet; so up to
    max_results + 1 HTTP calls total. Any failure returns [].

    Args:
        query: Search query
        max_results: Maximum results (1-10)
        language: Wikipedia language code (pt, en, es, etc)

    Returns:
        List of search results with title, url, content, score
    """
    base_url = f"https://{language}.wikipedia.org/w/api.php"

    # First, search for pages
    search_params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": min(max_results, 10),  # MediaWiki caps srlimit; clamp to 10
        "format": "json",
        "utf8": 1,
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Search for articles
            response = await client.get(base_url, params=search_params)
            response.raise_for_status()
            search_data = response.json()

            results = []
            search_results = search_data.get("query", {}).get("search", [])

            for i, item in enumerate(search_results):
                title = item.get("title", "")
                page_id = item.get("pageid")
                snippet = item.get("snippet", "")

                # Clean HTML from snippet
                snippet = _clean_html(snippet)

                # Get extract for better content (falls back to the snippet
                # below when the extract request fails or is empty)
                extract = await _get_page_extract(client, base_url, page_id)

                results.append({
                    "title": f"Wikipedia: {title}",
                    # Wikipedia article URLs use underscores for spaces
                    "url": f"https://{language}.wikipedia.org/wiki/{title.replace(' ', '_')}",
                    "content": extract or snippet,
                    "published_date": None,  # Wikipedia doesn't provide this easily
                    # Rank-decayed score starting at 0.7 (lower base: reference material)
                    "score": 0.7 - (i * 0.05),
                    "source": "wikipedia",
                })

            return results

    except Exception as e:
        print(f"Wikipedia search error: {e}")
        return []
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
async def _get_page_extract(
    client: httpx.AsyncClient,
    base_url: str,
    page_id: int,
) -> Optional[str]:
    """Fetch a short plain-text intro extract for a Wikipedia page.

    Args:
        client: Open async HTTP client to reuse for the request.
        base_url: Wikipedia API endpoint for the chosen language.
        page_id: Numeric Wikipedia page id.

    Returns:
        Up to 5 intro sentences of plain text, "" when the page has no
        extract, or None when the request fails.
    """
    params = {
        "action": "query",
        "pageids": page_id,
        "prop": "extracts",
        "exintro": True,      # intro section only
        "explaintext": True,  # plain text, no HTML
        "exsentences": 5,
        "format": "json",
    }

    try:
        response = await client.get(base_url, params=params)
        response.raise_for_status()  # don't parse an HTTP error body as a result
        data = response.json()
        pages = data.get("query", {}).get("pages", {})
        page = pages.get(str(page_id), {})
        return page.get("extract", "")
    except Exception:
        # Narrowed from a bare `except:` so task cancellation
        # (asyncio.CancelledError, a BaseException) still propagates.
        return None
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _clean_html(text: str) -> str:
|
| 105 |
+
"""Remove HTML tags from text."""
|
| 106 |
+
import re
|
| 107 |
+
clean = re.sub(r'<[^>]+>', '', text)
|
| 108 |
+
return clean.strip()
|
app/temporal/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Temporal intelligence module."""
|
app/temporal/freshness_scorer.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Freshness scoring for search results.
|
| 2 |
+
|
| 3 |
+
Calculates how fresh/recent content is using exponential decay.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
from app.config import get_settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def calculate_freshness_score(
|
| 14 |
+
published_date: Optional[datetime | str] = None,
|
| 15 |
+
half_life_days: Optional[int] = None,
|
| 16 |
+
) -> float:
|
| 17 |
+
"""
|
| 18 |
+
Calculate freshness score using exponential decay.
|
| 19 |
+
|
| 20 |
+
The score decays exponentially based on content age:
|
| 21 |
+
- Just published: ~1.0
|
| 22 |
+
- half_life_days old: ~0.5
|
| 23 |
+
- 2x half_life_days old: ~0.25
|
| 24 |
+
- Very old: approaches 0
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
published_date: When the content was published
|
| 28 |
+
half_life_days: Days until score halves (default from settings)
|
| 29 |
+
|
| 30 |
+
Returns:
|
| 31 |
+
Freshness score between 0.0 and 1.0
|
| 32 |
+
"""
|
| 33 |
+
if published_date is None:
|
| 34 |
+
# Unknown date gets neutral score
|
| 35 |
+
return 0.5
|
| 36 |
+
|
| 37 |
+
settings = get_settings()
|
| 38 |
+
if half_life_days is None:
|
| 39 |
+
half_life_days = settings.default_freshness_half_life
|
| 40 |
+
|
| 41 |
+
# Parse string dates if needed
|
| 42 |
+
if isinstance(published_date, str):
|
| 43 |
+
try:
|
| 44 |
+
# Try common formats
|
| 45 |
+
for fmt in ["%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ"]:
|
| 46 |
+
try:
|
| 47 |
+
published_date = datetime.strptime(published_date, fmt)
|
| 48 |
+
break
|
| 49 |
+
except ValueError:
|
| 50 |
+
continue
|
| 51 |
+
else:
|
| 52 |
+
return 0.5 # Couldn't parse, neutral score
|
| 53 |
+
except Exception:
|
| 54 |
+
return 0.5
|
| 55 |
+
|
| 56 |
+
# Calculate age in days
|
| 57 |
+
now = datetime.now()
|
| 58 |
+
if published_date > now:
|
| 59 |
+
# Future date (probably an error), treat as very fresh
|
| 60 |
+
return 1.0
|
| 61 |
+
|
| 62 |
+
age_days = (now - published_date).days
|
| 63 |
+
|
| 64 |
+
# Exponential decay: score = e^(-λt) where λ = ln(2) / half_life
|
| 65 |
+
decay_constant = 0.693147 / half_life_days # ln(2)
|
| 66 |
+
score = math.exp(-decay_constant * age_days)
|
| 67 |
+
|
| 68 |
+
# Ensure score is in valid range
|
| 69 |
+
return max(0.01, min(1.0, score))
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def get_freshness_label(score: float) -> str:
    """
    Get a human-readable label for a freshness score.

    Args:
        score: Freshness score 0-1

    Returns:
        Label like "Very Fresh", "Recent", "Dated", etc.
    """
    # Ordered high-to-low; the first threshold the score meets wins.
    bands = (
        (0.9, "Very Fresh"),
        (0.7, "Fresh"),
        (0.5, "Recent"),
        (0.3, "Dated"),
        (0.1, "Old"),
    )
    for threshold, label in bands:
        if score >= threshold:
            return label
    # Anything below the lowest threshold.
    return "Very Old"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def adjust_score_by_freshness(
    base_score: float,
    freshness_score: float,
    temporal_urgency: float,
) -> float:
    """
    Adjust a result's relevance score based on freshness.

    When temporal_urgency is high, freshness matters more.
    When temporal_urgency is low, freshness matters less.

    Args:
        base_score: Original relevance score (0-1)
        freshness_score: How fresh the content is (0-1)
        temporal_urgency: How important freshness is for this query (0-1)

    Returns:
        Adjusted score (0-1)
    """
    # Freshness can account for at most 40% of the blended score,
    # scaled down by how time-sensitive the query actually is.
    w_fresh = temporal_urgency * 0.4
    w_base = 1.0 - w_fresh

    blended = (base_score * w_base) + (freshness_score * w_fresh)

    # Clamp into the valid [0, 1] range.
    return min(1.0, max(0.0, blended))
|
app/temporal/intent_detector.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Temporal intent detection for search queries.
|
| 2 |
+
|
| 3 |
+
Analyzes queries to determine if they require fresh/current information
|
| 4 |
+
or if historical information is acceptable.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Literal
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _get_dynamic_years() -> set[str]:
    """Return the current and previous calendar years as strings."""
    current_year = datetime.now().year
    return {str(current_year), str(current_year - 1)}


# Keywords that strongly indicate need for current information.
# NOTE: this set is built once at import time; detect_temporal_intent()
# re-adds the dynamic years on every call so long-lived processes that
# cross a year boundary keep matching the new year.
FRESHNESS_KEYWORDS = {
    # English
    "latest", "newest", "recent", "current", "today", "now",
    "this week", "this month", "this year", "breaking",
    "update", "updates", "new", "just", "announced",
    *_get_dynamic_years(),  # Dynamic years
    # Portuguese
    "último", "últimos", "recente", "atual", "hoje", "agora",
    "essa semana", "esse mês", "esse ano", "novidade",
    "atualização", "novo", "novos", "anunciado",
}

# Keywords that indicate historical queries (less urgent freshness)
HISTORICAL_KEYWORDS = {
    "history", "historical", "origin", "origins", "invented",
    "founded", "first", "original", "classic", "traditional",
    "história", "histórico", "origem", "inventado", "fundado",
}

# Entity types that typically require fresh information
FRESH_ENTITY_PATTERNS = [
    r"\b(?:price|prices|stock|stocks|market)\b",  # Financial
    r"\b(?:weather|forecast|temperature)\b",  # Weather
    r"\b(?:news|headlines|breaking)\b",  # News
    r"\b(?:score|scores|game|match|vs)\b",  # Sports
    r"\b(?:version|release|update|patch)\b",  # Software
    r"\b(?:gpt-?\d|claude|gemini|llama|mistral)\b",  # AI models
]


def _contains_keyword(text: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text* as a whole word/phrase.

    Word-boundary matching avoids the false positives a naive substring
    test produces (e.g. "new" inside "newspaper", "now" inside
    "knowledge"). Works for multi-word phrases too ("this week").
    """
    return re.search(r"\b" + re.escape(keyword) + r"\b", text) is not None


def detect_temporal_intent(
    query: str,
) -> tuple[Literal["current", "historical", "neutral"], float]:
    """
    Detect the temporal intent of a search query.

    Args:
        query: The search query string

    Returns:
        Tuple of (intent, urgency) where:
        - intent: "current", "historical", or "neutral"
        - urgency: float 0-1 indicating how important freshness is
    """
    query_lower = query.lower()

    # Accumulated evidence for each direction; each matched keyword
    # contributes a fixed increment, capped at 1.0 below.
    freshness_score = 0.0
    historical_score = 0.0

    # Union in the current/previous year at call time: the module-level
    # constant is frozen at import and would go stale otherwise.
    freshness_keywords = FRESHNESS_KEYWORDS | _get_dynamic_years()

    # Check for freshness keywords (whole-word matches only)
    for keyword in freshness_keywords:
        if _contains_keyword(query_lower, keyword):
            freshness_score += 0.3

    # Check for historical keywords (whole-word matches only)
    for keyword in HISTORICAL_KEYWORDS:
        if _contains_keyword(query_lower, keyword):
            historical_score += 0.3

    # Check for fresh entity patterns (already anchored with \b)
    for pattern in FRESH_ENTITY_PATTERNS:
        if re.search(pattern, query_lower):
            freshness_score += 0.2

    # Question words that often imply current info needed
    if re.search(r"\b(?:what is|who is|how to|where is)\b", query_lower):
        freshness_score += 0.1

    # Superlatives often need current info
    if re.search(r"\b(?:best|top|most|fastest|cheapest)\b", query_lower):
        freshness_score += 0.15

    # Normalize scores into [0, 1]
    freshness_score = min(freshness_score, 1.0)
    historical_score = min(historical_score, 1.0)

    # Determine intent: whichever signal dominates and clears the 0.2
    # noise floor wins; otherwise treat the query as temporally neutral.
    if freshness_score > historical_score and freshness_score > 0.2:
        return "current", min(0.3 + freshness_score, 1.0)
    if historical_score > freshness_score and historical_score > 0.2:
        return "historical", max(0.2 - historical_score * 0.1, 0.1)
    return "neutral", 0.5
|