Shreyas94 committed
Commit
e867839
·
verified ·
1 Parent(s): 850efb7

Update app.py

Files changed (1)
  1. app.py +760 -58
app.py CHANGED
@@ -1,64 +1,766 @@
 
 
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(
-             minimum=0.1,
              maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
          ),
-     ],
- )
-

  if __name__ == "__main__":
-     demo.launch()
+ import asyncio
+ import aiohttp
  import gradio as gr
+ import json
+ import re
+ import time
+ from datetime import datetime
+ from typing import List, Dict, Optional, Tuple
+ from urllib.parse import quote_plus, urljoin
+ from dataclasses import dataclass
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import requests
+ from bs4 import BeautifulSoup
+ import newspaper
+ from newspaper import Article
+ import logging
+ import warnings
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore")
+ logging.getLogger().setLevel(logging.ERROR)
+
+ @dataclass
+ class SearchResult:
+     """Data class for search results"""
+     title: str
+     url: str
+     snippet: str
+     content: str = ""
+     publication_date: Optional[str] = None
+     relevance_score: float = 0.0
+
+ class QueryEnhancer:
+     """Enhance user queries with search operators and entity quoting"""
+
+     def __init__(self):
+         # Common named entity patterns
+         self.entity_patterns = [
+             r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b',  # Proper names
+             r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b',  # Acronyms + words
+             r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|Corp|LLC|Ltd|Co|Company|Trust|Group|Holdings)\b'  # Companies
+         ]
+
+     def enhance_query(self, query: str) -> str:
+         """Enhance query by quoting named entities and adding operators"""
+         enhanced = query
+
+         # Find and quote named entities
+         for pattern in self.entity_patterns:
+             matches = re.findall(pattern, enhanced)
+             for match in matches:
+                 if len(match.split()) > 1:  # Only quote multi-word entities
+                     enhanced = enhanced.replace(match, f'"{match}"')
+
+         return enhanced
+
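+ # Illustrative example (assumed input, not part of the class): with the
+ # patterns above, QueryEnhancer().enhance_query("latest news about Quantum Computing")
+ # returns 'latest news about "Quantum Computing"', so search engines treat
+ # the multi-word name as an exact phrase.
+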
+ class SearchEngineInterface:
+     """Interface for different search engines"""
+
+     def __init__(self):
+         self.session = None
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+         }
+
+     async def get_session(self):
+         """Get or create aiohttp session"""
+         if self.session is None:
+             connector = aiohttp.TCPConnector(limit=10)
+             timeout = aiohttp.ClientTimeout(total=30)
+             self.session = aiohttp.ClientSession(
+                 headers=self.headers,
+                 connector=connector,
+                 timeout=timeout
+             )
+         return self.session
+
+     async def search_google(self, query: str, num_results: int = 10) -> List[SearchResult]:
+         """Search Google and parse results"""
+         try:
+             session = await self.get_session()
+             url = f"https://www.google.com/search?q={quote_plus(query)}&num={num_results}"
+
+             async with session.get(url) as response:
+                 if response.status != 200:
+                     return []
+
+                 html = await response.text()
+                 soup = BeautifulSoup(html, 'html.parser')
+                 results = []
+
+                 # Parse Google search results
+                 for g in soup.find_all('div', class_='g')[:num_results]:
+                     try:
+                         title_elem = g.find('h3')
+                         if not title_elem:
+                             continue
+
+                         title = title_elem.get_text()
+
+                         # Get URL
+                         link_elem = g.find('a')
+                         if not link_elem or not link_elem.get('href'):
+                             continue
+                         url = link_elem['href']
+
+                         # Get snippet
+                         snippet_elem = g.find('span', class_=['st', 'aCOpRe'])
+                         if not snippet_elem:
+                             snippet_elem = g.find('div', class_=['s', 'st'])
+                         snippet = snippet_elem.get_text() if snippet_elem else ""
+
+                         if title and url.startswith('http'):
+                             results.append(SearchResult(title=title, url=url, snippet=snippet))
+                     except Exception:
+                         continue
+
+                 return results
+         except Exception as e:
+             print(f"Google search error: {e}")
+             return []
+
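+     # NB: scraping Google's HTML is brittle -- the 'g' / 'st' / 'aCOpRe'
+     # class names above change frequently, so an empty result list here
+     # usually means the markup moved rather than that nothing matched.
+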
+     async def search_bing(self, query: str, num_results: int = 10) -> List[SearchResult]:
+         """Search Bing and parse results"""
+         try:
+             session = await self.get_session()
+             url = f"https://www.bing.com/search?q={quote_plus(query)}&count={num_results}"
+
+             async with session.get(url) as response:
+                 if response.status != 200:
+                     return []
+
+                 html = await response.text()
+                 soup = BeautifulSoup(html, 'html.parser')
+                 results = []
+
+                 # Parse Bing search results
+                 for result in soup.find_all('li', class_='b_algo')[:num_results]:
+                     try:
+                         title_elem = result.find('h2')
+                         if not title_elem:
+                             continue
+
+                         link_elem = title_elem.find('a')
+                         if not link_elem:
+                             continue
+
+                         title = link_elem.get_text()
+                         url = link_elem.get('href', '')
+
+                         snippet_elem = result.find('p', class_='b_paractl') or result.find('div', class_='b_caption')
+                         snippet = snippet_elem.get_text() if snippet_elem else ""
+
+                         if title and url.startswith('http'):
+                             results.append(SearchResult(title=title, url=url, snippet=snippet))
+                     except Exception:
+                         continue
+
+                 return results
+         except Exception as e:
+             print(f"Bing search error: {e}")
+             return []
+
+     async def search_yahoo(self, query: str, num_results: int = 10) -> List[SearchResult]:
+         """Search Yahoo and parse results"""
+         try:
+             session = await self.get_session()
+             url = f"https://search.yahoo.com/search?p={quote_plus(query)}&n={num_results}"
+
+             async with session.get(url) as response:
+                 if response.status != 200:
+                     return []
+
+                 html = await response.text()
+                 soup = BeautifulSoup(html, 'html.parser')
+                 results = []
+
+                 # Parse Yahoo search results
+                 for result in soup.find_all('div', class_='dd')[:num_results]:
+                     try:
+                         title_elem = result.find('h3', class_='title')
+                         if not title_elem:
+                             continue
+
+                         link_elem = title_elem.find('a')
+                         if not link_elem:
+                             continue
+
+                         title = link_elem.get_text()
+                         url = link_elem.get('href', '')
+
+                         snippet_elem = result.find('div', class_='compText')
+                         snippet = snippet_elem.get_text() if snippet_elem else ""
+
+                         if title and url.startswith('http'):
+                             results.append(SearchResult(title=title, url=url, snippet=snippet))
+                     except Exception:
+                         continue
+
+                 return results
+         except Exception as e:
+             print(f"Yahoo search error: {e}")
+             return []
+
+     async def close(self):
+         """Close the session and reset it so the next search can recreate it"""
+         if self.session:
+             await self.session.close()
+             self.session = None
+
+ class ContentScraper:
+     """Scrape and parse article content using newspaper3k"""
+
+     def __init__(self):
+         self.session = None
+
+     async def get_session(self):
+         """Get or create aiohttp session"""
+         if self.session is None:
+             connector = aiohttp.TCPConnector(limit=20)
+             timeout = aiohttp.ClientTimeout(total=30)
+             self.session = aiohttp.ClientSession(
+                 connector=connector,
+                 timeout=timeout
+             )
+         return self.session
+
+     async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
+         """Scrape article content and publication date"""
+         try:
+             # Use newspaper3k for article extraction
+             article = Article(url)
+             article.download()
+             article.parse()
+
+             content = article.text
+             pub_date = article.publish_date.isoformat() if article.publish_date else None
+
+             return content, pub_date
+         except Exception as e:
+             print(f"Error scraping {url}: {e}")
+             return "", None
+
+     async def scrape_multiple(self, search_results: List[SearchResult]) -> List[SearchResult]:
+         """Scrape multiple articles in parallel"""
+         tasks = []
+         for result in search_results:
+             tasks.append(self.scrape_article(result.url))
+
+         scraped_data = await asyncio.gather(*tasks, return_exceptions=True)
+
+         for i, item in enumerate(scraped_data):
+             # gather() hands back the exception object itself when a task
+             # fails, so check before unpacking the (content, date) tuple
+             if isinstance(item, Exception):
+                 continue
+             content, pub_date = item
+             search_results[i].content = content
+             search_results[i].publication_date = pub_date
+
+         return search_results
+
+     async def close(self):
+         """Close the session and reset it so it can be recreated later"""
+         if self.session:
+             await self.session.close()
+             self.session = None
+
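+ # Note: Article.download()/parse() inside scrape_article are synchronous,
+ # so the coroutine blocks the event loop. A minimal non-blocking sketch
+ # (an assumption, not part of this commit) would off-load the work:
+ #
+ #     loop = asyncio.get_running_loop()
+ #     content, pub_date = await loop.run_in_executor(None, blocking_scrape, url)
+ #
+ # where blocking_scrape is a hypothetical plain function wrapping the
+ # newspaper3k calls above.
+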
+ class EmbeddingFilter:
+     """Filter search results by TF-IDF cosine similarity to the query
+     (a lightweight stand-in for neural embeddings)"""
+
+     def __init__(self):
+         self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
+
+     def filter_by_relevance(self, query: str, search_results: List[SearchResult],
+                             threshold: float = 0.1) -> List[SearchResult]:
+         """Filter results by cosine similarity with query"""
+         if not search_results:
+             return search_results
+
+         # Combine title, snippet, and content for each result
+         result_texts = []
+         for result in search_results:
+             combined_text = f"{result.title} {result.snippet} {result.content[:1000]}"
+             result_texts.append(combined_text)
+
+         if not result_texts:
+             return search_results
+
+         try:
+             # Add query to the corpus for vectorization
+             all_texts = [query] + result_texts
+
+             # Vectorize texts
+             tfidf_matrix = self.vectorizer.fit_transform(all_texts)
+
+             # Calculate cosine similarity between query and each result
+             query_vector = tfidf_matrix[0:1]
+             result_vectors = tfidf_matrix[1:]
+
+             similarities = cosine_similarity(query_vector, result_vectors)[0]
+
+             # Add relevance scores and filter
+             filtered_results = []
+             for i, result in enumerate(search_results):
+                 # Store as a plain Python float rather than a numpy scalar
+                 result.relevance_score = float(similarities[i])
+                 if similarities[i] >= threshold:
+                     filtered_results.append(result)
+
+             # Sort by relevance score
+             filtered_results.sort(key=lambda x: x.relevance_score, reverse=True)
+             return filtered_results
+
+         except Exception as e:
+             print(f"Embedding filter error: {e}")
+             return search_results
+
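+ # The TF-IDF vectorizer above is a lightweight proxy for neural embeddings.
+ # A hedged upgrade sketch (assumes the sentence-transformers package, which
+ # this commit does not install) that keeps the same cosine-similarity step:
+ #
+ #     from sentence_transformers import SentenceTransformer
+ #     model = SentenceTransformer("all-MiniLM-L6-v2")
+ #     vectors = model.encode([query] + result_texts)
+ #     similarities = cosine_similarity(vectors[0:1], vectors[1:])[0]
+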
+ class LLMSummarizer:
+     """Summarize search results using Groq or OpenRouter APIs"""
+
+     def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
+         self.groq_api_key = groq_api_key
+         self.openrouter_api_key = openrouter_api_key
+         self.groq_model = "meta-llama/llama-4-maverick-17b-128e-instruct"
+         self.openrouter_model = "deepseek/deepseek-r1:free"
+
+     def create_system_prompt(self) -> str:
+         """Create system prompt for summarization"""
+         return """You are an expert summarizer. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
+
+ Instructions:
+ 1. Focus only on information relevant to the user's query
+ 2. Filter out noise, advertisements, and unrelated content
+ 3. Synthesize information from multiple sources when possible
+ 4. Maintain factual accuracy and cite sources when appropriate
+ 5. If information is contradictory, note the discrepancies
+ 6. Provide a clear, concise summary that directly addresses the query
+ 7. Include relevant dates, numbers, and specific details when available
+
+ Format your response as a comprehensive summary, not bullet points."""
+
+     async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
+                                   temperature: float = 0.3, max_tokens: int = 2000) -> str:
+         """Summarize using Groq API"""
+         if not self.groq_api_key:
+             return "Groq API key not provided"
+
+         try:
+             # Prepare the content for summarization
+             content_json = {
+                 "user_query": query,
+                 "search_results": []
+             }
+
+             for result in search_results:
+                 content_json["search_results"].append({
+                     "title": result.title,
+                     "url": result.url,
+                     "snippet": result.snippet,
+                     "content": result.content[:2000],  # Limit content length
+                     "publication_date": result.publication_date,
+                     "relevance_score": result.relevance_score
+                 })
+
+             user_prompt = f"""Please summarize the following search results for the query: "{query}"
+
+ Search Results Data:
+ {json.dumps(content_json, indent=2)}
+
+ Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
+
+             headers = {
+                 "Authorization": f"Bearer {self.groq_api_key}",
+                 "Content-Type": "application/json"
+             }
+
+             payload = {
+                 "model": self.groq_model,
+                 "messages": [
+                     {"role": "system", "content": self.create_system_prompt()},
+                     {"role": "user", "content": user_prompt}
+                 ],
+                 "temperature": temperature,
+                 "max_tokens": max_tokens
+             }
+
+             async with aiohttp.ClientSession() as session:
+                 async with session.post("https://api.groq.com/openai/v1/chat/completions",
+                                         headers=headers, json=payload) as response:
+                     if response.status == 200:
+                         result = await response.json()
+                         return result["choices"][0]["message"]["content"]
+                     else:
+                         error_text = await response.text()
+                         return f"Groq API error: {response.status} - {error_text}"
+
+         except Exception as e:
+             return f"Error with Groq summarization: {str(e)}"
+
+     async def summarize_with_openrouter(self, query: str, search_results: List[SearchResult],
+                                         temperature: float = 0.3, max_tokens: int = 2000) -> str:
+         """Summarize using OpenRouter API"""
+         if not self.openrouter_api_key:
+             return "OpenRouter API key not provided"
+
+         try:
+             # Prepare the content for summarization
+             content_json = {
+                 "user_query": query,
+                 "search_results": []
+             }
+
+             for result in search_results:
+                 content_json["search_results"].append({
+                     "title": result.title,
+                     "url": result.url,
+                     "snippet": result.snippet,
+                     "content": result.content[:2000],  # Limit content length
+                     "publication_date": result.publication_date,
+                     "relevance_score": result.relevance_score
+                 })
+
+             user_prompt = f"""Please summarize the following search results for the query: "{query}"
+
+ Search Results Data:
+ {json.dumps(content_json, indent=2)}
+
+ Provide a comprehensive summary that directly answers the user's query based on the most relevant and recent information available."""
+
+             headers = {
+                 "Authorization": f"Bearer {self.openrouter_api_key}",
+                 "Content-Type": "application/json",
+                 "HTTP-Referer": "https://huggingface.co/spaces",
+                 "X-Title": "AI Search Engine"
+             }
+
+             payload = {
+                 "model": self.openrouter_model,
+                 "messages": [
+                     {"role": "system", "content": self.create_system_prompt()},
+                     {"role": "user", "content": user_prompt}
+                 ],
+                 "temperature": temperature,
+                 "max_tokens": max_tokens
+             }
+
+             async with aiohttp.ClientSession() as session:
+                 async with session.post("https://openrouter.ai/api/v1/chat/completions",
+                                         headers=headers, json=payload) as response:
+                     if response.status == 200:
+                         result = await response.json()
+                         return result["choices"][0]["message"]["content"]
+                     else:
+                         error_text = await response.text()
+                         return f"OpenRouter API error: {response.status} - {error_text}"
+
+         except Exception as e:
+             return f"Error with OpenRouter summarization: {str(e)}"
+
+ class AISearchEngine:
+     """Main AI-powered search engine class"""
+
+     def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
+         self.query_enhancer = QueryEnhancer()
+         self.search_interface = SearchEngineInterface()
+         self.content_scraper = ContentScraper()
+         self.embedding_filter = EmbeddingFilter()
+         self.llm_summarizer = LLMSummarizer(groq_api_key, openrouter_api_key)
+
+     async def search_and_summarize(self,
+                                    query: str,
+                                    search_engines: List[str],
+                                    model: str,
+                                    use_embeddings: bool,
+                                    temperature: float,
+                                    max_results: int,
+                                    max_tokens: int) -> Tuple[str, str]:
+         """Main search and summarization pipeline"""
+
+         start_time = time.time()
+         status_updates = []
+
+         try:
+             # Step 1: Query Enhancement
+             status_updates.append("🔍 Enhancing search query...")
+             enhanced_query = self.query_enhancer.enhance_query(query)
+             status_updates.append(f"Enhanced query: {enhanced_query}")
+
+             # Step 2: Parallel Search across engines
+             status_updates.append("🌐 Searching across multiple engines...")
+             search_tasks = []
+
+             if "Google" in search_engines:
+                 search_tasks.append(self.search_interface.search_google(enhanced_query, max_results))
+             if "Bing" in search_engines:
+                 search_tasks.append(self.search_interface.search_bing(enhanced_query, max_results))
+             if "Yahoo" in search_engines:
+                 search_tasks.append(self.search_interface.search_yahoo(enhanced_query, max_results))
+
+             if not search_tasks:
+                 return "No search engines selected", "\n".join(status_updates)
+
+             search_results_lists = await asyncio.gather(*search_tasks)
+
+             # Combine and deduplicate results
+             all_results = []
+             seen_urls = set()
+
+             for results_list in search_results_lists:
+                 for result in results_list:
+                     if result.url not in seen_urls:
+                         all_results.append(result)
+                         seen_urls.add(result.url)
+
+             status_updates.append(f"Found {len(all_results)} unique results")
+
+             if not all_results:
+                 return "No search results found", "\n".join(status_updates)
+
+             # Step 3: Content Scraping
+             status_updates.append("📄 Scraping article content...")
+             scraped_results = await self.content_scraper.scrape_multiple(all_results[:max_results])
+
+             # Filter results with content
+             results_with_content = [r for r in scraped_results if r.content.strip()]
+             status_updates.append(f"Successfully scraped {len(results_with_content)} articles")
+
+             # Step 4: Optional Embedding-based Filtering
+             if use_embeddings and results_with_content:
+                 status_updates.append("🧠 Filtering results using embeddings...")
+                 filtered_results = self.embedding_filter.filter_by_relevance(query, results_with_content)
+                 status_updates.append(f"Filtered to {len(filtered_results)} most relevant results")
+             else:
+                 filtered_results = results_with_content
+
+             if not filtered_results:
+                 return "No relevant results found after filtering", "\n".join(status_updates)
+
+             # Step 5: LLM Summarization
+             status_updates.append(f"🤖 Generating summary using {model}...")
+
+             if model.startswith("Groq"):
+                 summary = await self.llm_summarizer.summarize_with_groq(
+                     query, filtered_results, temperature, max_tokens
+                 )
+             else:  # OpenRouter
+                 summary = await self.llm_summarizer.summarize_with_openrouter(
+                     query, filtered_results, temperature, max_tokens
+                 )
+
+             # Add metadata
+             end_time = time.time()
+             processing_time = end_time - start_time
+
+             metadata = "\n\n---\n**Search Metadata:**\n"
+             metadata += f"- Processing time: {processing_time:.2f} seconds\n"
+             metadata += f"- Results found: {len(all_results)}\n"
+             metadata += f"- Articles scraped: {len(results_with_content)}\n"
+             metadata += f"- Results used for summary: {len(filtered_results)}\n"
+             metadata += f"- Search engines: {', '.join(search_engines)}\n"
+             metadata += f"- Model: {model}\n"
+             metadata += f"- Embeddings used: {use_embeddings}\n"
+
+             final_summary = summary + metadata
+             status_updates.append(f"✅ Summary generated in {processing_time:.2f}s")
+
+             return final_summary, "\n".join(status_updates)
+
+         except Exception as e:
+             error_msg = f"Error in search pipeline: {str(e)}"
+             status_updates.append(f"❌ {error_msg}")
+             return error_msg, "\n".join(status_updates)
+
+         finally:
+             # Cleanup (sessions are recreated on the next call)
+             await self.search_interface.close()
+             await self.content_scraper.close()
+
+ # Global search engine instance
+ search_engine = None
+
+ async def initialize_search_engine(groq_key: str, openrouter_key: str):
+     """Initialize the search engine with API keys"""
+     global search_engine
+     search_engine = AISearchEngine(groq_key, openrouter_key)
+     return search_engine
+
+ async def perform_search(query: str,
+                          search_engines: List[str],
+                          model: str,
+                          use_embeddings: bool,
+                          temperature: float,
+                          max_results: int,
+                          max_tokens: int,
+                          groq_key: str,
+                          openrouter_key: str):
+     """Perform search with given parameters"""
+     global search_engine
+
+     if search_engine is None:
+         search_engine = await initialize_search_engine(groq_key, openrouter_key)
+
+     return await search_engine.search_and_summarize(
+         query, search_engines, model, use_embeddings,
+         temperature, max_results, max_tokens
+     )
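+
+ # Illustrative standalone call (placeholder values are assumptions):
+ #
+ #     summary, status = asyncio.run(perform_search(
+ #         "latest AI news", ["Google", "Bing"], "Groq (Llama-4)",
+ #         True, 0.3, 10, 2000, groq_key="...", openrouter_key=""))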
+
+ async def chat_inference(message, history, groq_key, openrouter_key, model_choice, search_engines, use_embeddings, temperature, max_results, max_tokens):
+     """Main chat inference function for ChatInterface with additional inputs"""
+     try:
+         if not message.strip():
+             yield "Please enter a search query."
+             return
+
+         if not groq_key and not openrouter_key:
+             yield "❌ Please provide at least one API key (Groq or OpenRouter) to use the AI summarization features."
+             return
+
+         if not search_engines:
+             yield "❌ Please select at least one search engine."
+             return
+
+         # Initialize search engine
+         global search_engine
+         if search_engine is None:
+             search_engine = await initialize_search_engine(groq_key, openrouter_key)
+         else:
+             # Update API keys if they changed
+             search_engine.llm_summarizer.groq_api_key = groq_key
+             search_engine.llm_summarizer.openrouter_api_key = openrouter_key
+
+         # Cosmetic progress updates: these are yielded up front so the user
+         # sees activity; the actual pipeline runs in one call further below
+         yield "🔍 Enhancing query and searching across multiple engines..."
+         await asyncio.sleep(0.1)
+
+         yield "🌐 Fetching results from search engines..."
+         await asyncio.sleep(0.1)
+
+         yield "📄 Scraping article content..."
+         await asyncio.sleep(0.1)
+
+         if use_embeddings:
+             yield "🧠 Filtering results using embeddings..."
+             await asyncio.sleep(0.1)
+
+         yield "🤖 Generating AI-powered summary..."
+         await asyncio.sleep(0.1)
+
+         # Perform the actual search and summarization
+         summary, status = await search_engine.search_and_summarize(
+             message,
+             search_engines,
+             model_choice,
+             use_embeddings,
+             temperature,
+             max_results,
+             max_tokens
+         )
+
+         # Stream the final result
+         yield summary
+
+     except Exception as e:
+         yield f"❌ Search failed: {str(e)}\n\nPlease check your API keys and try again."
+
+ def create_gradio_interface():
+     """Create the modern Gradio ChatInterface"""
+
+     # Define additional inputs for the accordion
+     additional_inputs = [
+         gr.Textbox(
+             label="🔑 Groq API Key",
+             type="password",
+             placeholder="Enter your Groq API key (get from: https://console.groq.com/)",
+             info="Required for Groq Llama-4 model"
+         ),
+         gr.Textbox(
+             label="🔑 OpenRouter API Key",
+             type="password",
+             placeholder="Enter your OpenRouter API key (get from: https://openrouter.ai/)",
+             info="Required for OpenRouter DeepSeek-R1 model"
+         ),
+         gr.Dropdown(
+             choices=["Groq (Llama-4)", "OpenRouter (DeepSeek-R1)"],
+             value="Groq (Llama-4)",
+             label="🤖 AI Model",
+             info="Choose the AI model for summarization"
+         ),
+         gr.CheckboxGroup(
+             choices=["Google", "Bing", "Yahoo"],
+             value=["Google", "Bing"],
+             label="🔍 Search Engines",
+             info="Select which search engines to use (multiple recommended)"
+         ),
+         gr.Checkbox(
+             value=True,
+             label="🧠 Use Embedding-based Filtering",
+             info="Filter results by relevance using TF-IDF similarity (recommended)"
+         ),
          gr.Slider(
+             minimum=0.0,
              maximum=1.0,
+             value=0.3,
+             step=0.1,
+             label="🌡️ Temperature",
+             info="Higher = more creative, Lower = more focused (0.1-0.3 recommended for factual queries)"
          ),
+         gr.Slider(
+             minimum=5,
+             maximum=20,
+             value=10,
+             step=1,
+             label="📊 Max Results per Engine",
+             info="Number of search results to fetch from each engine"
+         ),
+         gr.Slider(
+             minimum=500,
+             maximum=4000,
+             value=2000,
+             step=100,
+             label="📝 Max Tokens",
+             info="Maximum length of the AI-generated summary"
+         )
+     ]
+
+     # Create the main ChatInterface
+     chat_interface = gr.ChatInterface(
+         fn=chat_inference,
+         additional_inputs=additional_inputs,
+         additional_inputs_accordion=gr.Accordion("⚙️ Configuration & Advanced Parameters", open=True),
+         title="🔍 AI-Powered Search Engine",
+         description="""
+         **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
+
+         ✨ **Features:** Multi-engine search • Query enhancement • Parallel scraping • AI summarization • Embedding filtering
+
+         📋 **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
+         """,
+         cache_examples=False,
+         # retry_btn="🔄 Retry",
+         # undo_btn="↩️ Undo",
+         # clear_btn="🗑️ Clear",
+         submit_btn="🔍 Search & Summarize",
+         stop_btn="⏹️ Stop",
+         chatbot=gr.Chatbot(
+             show_copy_button=True,
+             # likeable=True,
+             layout="bubble",
+             height=600,
+             placeholder="🚀 Ready to search! Configure your settings below and ask me anything.",
+             show_share_button=True
+         ),
+         theme=gr.themes.Soft(),
+         analytics_enabled=False,
+         type="messages"  # Use the modern message format
+     )
+
+     return chat_interface

  if __name__ == "__main__":
+     demo = create_gradio_interface()
+     demo.launch(share=True)
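+
+ # Assumed runtime dependencies for this Space (not pinned by this commit):
+ # gradio, aiohttp, requests, beautifulsoup4, newspaper3k, scikit-learn, numpy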