Madras1 commited on
Commit
2bd2625
·
verified ·
1 Parent(s): 234d630

Upload 48 files

Browse files
.env.example CHANGED
@@ -2,9 +2,12 @@
2
  GROQ_API_KEY=gsk_your_groq_key
3
  OPENROUTER_API_KEY=sk-or-your_openrouter_key
4
 
5
- # Search Sources
6
- TAVILY_API_KEY=tvly-your_tavily_key
7
- SERPER_API_KEY=your_serper_key # Optional
 
 
 
8
 
9
  # Configuration
10
  LLM_PROVIDER=groq # or "openrouter"
 
2
  GROQ_API_KEY=gsk_your_groq_key
3
  OPENROUTER_API_KEY=sk-or-your_openrouter_key
4
 
5
+ # SearXNG (self-hosted, optional)
6
+ SEARXNG_URL=https://your-searxng.hf.space
7
+
8
+ # Search APIs
9
+ TAVILY_API_KEY=tvly-xxxxx
10
+ BRAVE_API_KEY=BSAxxxxx
11
 
12
  # Configuration
13
  LLM_PROVIDER=groq # or "openrouter"
app/api/routes/search.py CHANGED
@@ -348,3 +348,73 @@ async def image_search(request: Request, query: str, max_results: int = 6):
348
 
349
  return {"query": query, "images": images}
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
  return {"query": query, "images": images}
350
 
351
+
352
+ # === EXPERIMENTAL: SearXNG-only search ===
353
+
354
@router.post(
    "/search/searxng",
    summary="[EXPERIMENTAL] Search using only SearXNG",
    description="Test endpoint: uses SearXNG meta-search with embedding reranking. Returns 50+ results.",
)
@limiter.limit("20/minute")
async def searxng_search(request: Request, body: SearchRequest):
    """
    EXPERIMENTAL: Search using only SearXNG.

    This endpoint bypasses paid APIs and uses SearXNG public instances.
    With 50+ results, embedding-based reranking is actually useful here.

    Streams Server-Sent Events: `status` -> `searxng_complete` -> `status`
    -> `results` -> `done`, or an `error` event if anything fails.
    """
    import json
    from app.sources.searxng import search_searxng
    from app.reranking.embeddings import compute_bi_encoder_scores

    def sse(payload: dict) -> str:
        # An SSE frame must end with a real blank line ("\n\n"). A literal
        # backslash-n sequence ("\\n\\n") would never terminate the event,
        # so browsers' EventSource would hang waiting for the frame to close.
        return f"data: {json.dumps(payload)}\n\n"

    async def event_generator():
        try:
            # Step 1: Search SearXNG
            yield sse({"type": "status", "message": "Searching SearXNG..."})

            # Map freshness onto SearXNG's time_range values. "year" is a
            # valid SearXNG range and is accepted by search_searxng, so it
            # is included here as well.
            time_range = {
                "day": "day",
                "week": "week",
                "month": "month",
                "year": "year",
            }.get(body.freshness)
            raw_results = await search_searxng(
                query=body.query,
                max_results=50,
                time_range=time_range,
            )

            if not raw_results:
                yield sse({"type": "error", "message": "No results from SearXNG"})
                return

            yield sse({"type": "searxng_complete", "count": len(raw_results)})

            # Step 2: Rerank with embeddings
            yield sse({"type": "status", "message": "Reranking with embeddings..."})

            docs = [f"{r.get('title', '')}. {r.get('content', '')[:500]}" for r in raw_results]
            scores = compute_bi_encoder_scores(body.query, docs)

            # Blend embedding similarity (70%) with the source's own
            # position/engine heuristic score (30%).
            for i, result in enumerate(raw_results):
                result["embedding_score"] = scores[i]
                orig_score = result.get("score", 0.5)
                result["score"] = (scores[i] * 0.7) + (orig_score * 0.3)

            raw_results.sort(key=lambda x: x["score"], reverse=True)
            final_results = raw_results[:body.max_results]

            # Step 3: Return results
            yield sse({
                "type": "results",
                "results": [
                    {
                        "title": r.get("title"),
                        "url": r.get("url"),
                        "content": r.get("content", "")[:300],
                        "score": round(r.get("score", 0), 3),
                        "source": r.get("source"),
                    }
                    for r in final_results
                ],
            })

            yield sse({
                "type": "done",
                "total_raw": len(raw_results),
                "returned": len(final_results),
            })

        except Exception as e:
            # Surface the failure to the client as a terminal SSE event
            # rather than dropping the stream mid-flight.
            yield sse({"type": "error", "message": str(e)})

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
419
+
420
+
app/config.py CHANGED
@@ -18,6 +18,9 @@ class Settings(BaseSettings):
18
  # API Keys - Search Sources
19
  tavily_api_key: str = ""
20
  brave_api_key: str = "" # 2000 free/month
 
 
 
21
  serper_api_key: str | None = None
22
 
23
  # API Keys - LLM Providers
 
18
  # API Keys - Search Sources
19
  tavily_api_key: str = ""
20
  brave_api_key: str = "" # 2000 free/month
21
+
22
+ # SearXNG (self-hosted meta-search)
23
+ searxng_url: str = "" # e.g. https://madras1-searxng.hf.space
24
  serper_api_key: str | None = None
25
 
26
  # API Keys - LLM Providers
app/sources/searxng.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SearXNG meta-search source.
2
+
3
+ Uses a self-hosted SearXNG instance for comprehensive search results
4
+ from multiple engines (Google, Bing, DDG, etc.) without API costs.
5
+ """
6
+
7
+ from typing import Optional
8
+ from datetime import datetime
9
+
10
+ import httpx
11
+
12
+ from app.config import get_settings
13
+
14
+
15
+ # Default SearXNG instance (your HF Space)
16
+ DEFAULT_SEARXNG_URL = "https://madras1-searxng.hf.space"
17
+
18
+ # Fallback public instances (if your instance is down)
19
+ FALLBACK_INSTANCES = [
20
+ "https://searx.be",
21
+ "https://search.sapti.me",
22
+ "https://searx.tiekoetter.com",
23
+ ]
24
+
25
+
26
async def search_searxng(
    query: str,
    max_results: int = 50,
    categories: Optional[list[str]] = None,
    engines: Optional[list[str]] = None,
    language: str = "all",
    time_range: Optional[str] = None,
    searxng_url: Optional[str] = None,
) -> list[dict]:
    """
    Search using SearXNG meta-search engine.

    Returns many more results than API-based sources, making
    embedding-based reranking valuable.

    Args:
        query: Search query
        max_results: Maximum results to return (can be 50-100+)
        categories: Search categories (general, news, science, etc.)
        engines: Specific engines to use (google, bing, etc.)
        language: Language code (en, pt, all)
        time_range: Time filter (day, week, month, year)
        searxng_url: Custom SearXNG instance URL

    Returns:
        List of search results with title, url, content, source.
        Empty list if every instance fails or returns nothing.
    """
    settings = get_settings()

    # Build the instance list by priority: explicit argument, configured
    # URL, default instance, then public fallbacks. De-duplicate while
    # preserving that order — the configured URL frequently equals the
    # default, and retrying the same dead instance just adds latency.
    candidates = [
        searxng_url,
        getattr(settings, "searxng_url", ""),
        DEFAULT_SEARXNG_URL,
        *FALLBACK_INSTANCES,
    ]
    instances: list[str] = []
    for url in candidates:
        if url and url not in instances:
            instances.append(url)

    # Build query params; optional filters are only sent when set.
    params = {
        "q": query,
        "format": "json",
        "language": language,
    }
    if categories:
        params["categories"] = ",".join(categories)
    if engines:
        params["engines"] = ",".join(engines)
    if time_range:
        params["time_range"] = time_range

    # Try each instance in order until one returns results.
    for instance in instances:
        try:
            results = await _fetch_searxng(instance, params, max_results)
            if results:
                return results
        except Exception as e:
            # Best-effort fan-out: note the failure and move on to the
            # next instance rather than aborting the whole search.
            print(f"SearXNG instance {instance} failed: {e}")
            continue

    return []
89
+
90
+
91
async def _fetch_searxng(
    instance_url: str,
    params: dict,
    max_results: int,
) -> list[dict]:
    """Query one SearXNG instance and normalize its JSON results.

    Raises on HTTP/transport errors; the caller decides whether to fall
    back to another instance.
    """
    endpoint = f"{instance_url.rstrip('/')}/search"

    async with httpx.AsyncClient(timeout=15.0) as client:
        response = await client.get(
            endpoint,
            params=params,
            headers={"Accept": "application/json"},
        )
        response.raise_for_status()
        payload = response.json()

    normalized = []
    for entry in payload.get("results", [])[:max_results]:
        record = {
            "title": entry.get("title", ""),
            "url": entry.get("url", ""),
            "content": entry.get("content", ""),
            "source": f"searxng:{entry.get('engine', 'unknown')}",
            "score": _calculate_score(entry),
        }

        # Carry the publication date through when the engine provides one.
        published = entry.get("publishedDate")
        if published:
            record["published_date"] = published

        normalized.append(record)

    return normalized
125
+
126
+
127
+ def _calculate_score(item: dict) -> float:
128
+ """Calculate initial score based on position and engine."""
129
+ # Base score from position (if available)
130
+ position = item.get("position", 10)
131
+ position_score = max(0.3, 1.0 - (position * 0.05))
132
+
133
+ # Bonus for certain engines
134
+ engine = item.get("engine", "").lower()
135
+ engine_bonus = {
136
+ "google": 0.1,
137
+ "bing": 0.05,
138
+ "duckduckgo": 0.05,
139
+ "wikipedia": 0.1,
140
+ "arxiv": 0.15,
141
+ "google scholar": 0.15,
142
+ }.get(engine, 0)
143
+
144
+ return min(1.0, position_score + engine_bonus)
145
+
146
+
147
async def get_searxng_engines(searxng_url: Optional[str] = None) -> list[str]:
    """Return the names of enabled engines on a SearXNG instance.

    Best-effort helper: any network, HTTP, or parse failure yields an
    empty list rather than an exception.
    """
    base = searxng_url or DEFAULT_SEARXNG_URL

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.get(f"{base}/config")
            resp.raise_for_status()
            config = resp.json()

        return [
            eng["name"]
            for eng in config.get("engines", [])
            if not eng.get("disabled", False)
        ]
    except Exception:
        return []