Shreyas94 committed on
Commit 73b3250 · verified · 1 Parent(s): 9d35d68

Update app.py

Files changed (1)
  1. app.py +29 -1061
app.py CHANGED
@@ -1,1069 +1,37 @@
1
- import asyncio
2
- import aiohttp
3
  import gradio as gr
4
- import json
5
- import re
6
- import time
7
- from datetime import datetime
8
- from typing import List, Dict, Optional, Tuple
9
- from urllib.parse import quote_plus, urljoin
10
- from dataclasses import dataclass
11
- import numpy as np
12
- from sklearn.metrics.pairwise import cosine_similarity
13
- from sklearn.feature_extraction.text import TfidfVectorizer
14
- import requests
15
- from bs4 import BeautifulSoup
16
- import newspaper
17
- from newspaper import Article
18
- import logging
19
- import warnings
20
 
21
- # Suppress warnings
22
- warnings.filterwarnings("ignore")
23
- logging.getLogger().setLevel(logging.ERROR)
24
 
25
- @dataclass
26
- class SearchResult:
27
- """Data class for search results"""
28
- title: str
29
- url: str
30
- snippet: str
31
- content: str = ""
32
- publication_date: Optional[str] = None
33
- relevance_score: float = 0.0
34
-
35
- class QueryEnhancer:
36
- """Enhance user queries with search operators and entity quoting"""
37
-
38
- def __init__(self):
39
- # Common named entity patterns
40
- self.entity_patterns = [
41
- r'\b[A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', # Proper names
42
- r'\b[A-Z]{2,}(?:\s+[A-Z][a-z]+)*\b', # Acronyms + words
43
- r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|Corp|LLC|Ltd|Co|Company|Trust|Group|Holdings)\b' # Companies
44
- ]
45
-
46
- def enhance_query(self, query: str) -> str:
47
- """Enhance query by quoting named entities and adding operators"""
48
- enhanced = query
49
-
50
- # Find and quote named entities
51
- for pattern in self.entity_patterns:
52
- matches = re.findall(pattern, enhanced)
53
- for match in matches:
54
- if len(match.split()) > 1: # Only quote multi-word entities
55
- enhanced = enhanced.replace(match, f'"{match}"')
56
-
57
- return enhanced
58
-
59
- class SearchEngineInterface:
60
- """Interface for different search engines"""
61
-
62
- def __init__(self):
63
- self.session = None
64
- self.headers = {
65
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
66
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
67
- 'Accept-Language': 'en-US,en;q=0.9',
68
- 'Accept-Encoding': 'gzip, deflate, br',
69
- 'Connection': 'keep-alive',
70
- 'Upgrade-Insecure-Requests': '1',
71
- 'Sec-Fetch-Dest': 'document',
72
- 'Sec-Fetch-Mode': 'navigate',
73
- 'Sec-Fetch-Site': 'none',
74
- 'Sec-Fetch-User': '?1',
75
- 'Cache-Control': 'max-age=0',
76
- }
77
-
78
- async def get_session(self):
79
- """Get or create aiohttp session with better configuration"""
80
- if self.session is None or self.session.closed:
81
- connector = aiohttp.TCPConnector(
82
- limit=20,
83
- limit_per_host=5,
84
- ttl_dns_cache=300,
85
- use_dns_cache=True,
86
- keepalive_timeout=30,
87
- enable_cleanup_closed=True
88
- )
89
- timeout = aiohttp.ClientTimeout(total=45, connect=15, sock_read=30)
90
- self.session = aiohttp.ClientSession(
91
- headers=self.headers,
92
- connector=connector,
93
- timeout=timeout,
94
- trust_env=True
95
- )
96
- return self.session
97
-
98
- async def search_google(self, query: str, num_results: int = 10) -> List[SearchResult]:
99
- """Search Google and parse results"""
100
- try:
101
- session = await self.get_session()
102
- url = f"https://www.google.com/search?q={quote_plus(query)}&num={num_results}"
103
-
104
- async with session.get(url) as response:
105
- if response.status != 200:
106
- return []
107
-
108
- html = await response.text()
109
- soup = BeautifulSoup(html, 'html.parser')
110
- results = []
111
-
112
- # Parse Google search results
113
- for g in soup.find_all('div', class_='g')[:num_results]:
114
- try:
115
- title_elem = g.find('h3')
116
- if not title_elem:
117
- continue
118
-
119
- title = title_elem.get_text()
120
-
121
- # Get URL
122
- link_elem = g.find('a')
123
- if not link_elem or not link_elem.get('href'):
124
- continue
125
- url = link_elem['href']
126
-
127
- # Get snippet
128
- snippet_elem = g.find('span', class_=['st', 'aCOpRe'])
129
- if not snippet_elem:
130
- snippet_elem = g.find('div', class_=['s', 'st'])
131
- snippet = snippet_elem.get_text() if snippet_elem else ""
132
-
133
- if title and url.startswith('http'):
134
- results.append(SearchResult(title=title, url=url, snippet=snippet))
135
- except Exception as e:
136
- continue
137
-
138
- return results
139
- except Exception as e:
140
- print(f"Google search error: {e}")
141
- return []
142
-
143
- async def search_bing(self, query: str, num_results: int = 10) -> List[SearchResult]:
144
- """Search Bing and parse results"""
145
- try:
146
- session = await self.get_session()
147
- url = f"https://www.bing.com/search?q={quote_plus(query)}&count={num_results}"
148
-
149
- async with session.get(url) as response:
150
- if response.status != 200:
151
- return []
152
-
153
- html = await response.text()
154
- soup = BeautifulSoup(html, 'html.parser')
155
- results = []
156
-
157
- # Parse Bing search results
158
- for result in soup.find_all('li', class_='b_algo')[:num_results]:
159
- try:
160
- title_elem = result.find('h2')
161
- if not title_elem:
162
- continue
163
-
164
- link_elem = title_elem.find('a')
165
- if not link_elem:
166
- continue
167
-
168
- title = link_elem.get_text()
169
- url = link_elem.get('href', '')
170
-
171
- snippet_elem = result.find('p', class_='b_paractl') or result.find('div', class_='b_caption')
172
- snippet = snippet_elem.get_text() if snippet_elem else ""
173
-
174
- if title and url.startswith('http'):
175
- results.append(SearchResult(title=title, url=url, snippet=snippet))
176
- except Exception as e:
177
- continue
178
-
179
- return results
180
- except Exception as e:
181
- print(f"Bing search error: {e}")
182
- return []
183
-
184
- async def search_yahoo(self, query: str, num_results: int = 10) -> List[SearchResult]:
185
- """Search Yahoo and parse results"""
186
- try:
187
- session = await self.get_session()
188
- url = f"https://search.yahoo.com/search?p={quote_plus(query)}&n={num_results}"
189
-
190
- async with session.get(url) as response:
191
- if response.status != 200:
192
- return []
193
-
194
- html = await response.text()
195
- soup = BeautifulSoup(html, 'html.parser')
196
- results = []
197
-
198
- # Parse Yahoo search results
199
- for result in soup.find_all('div', class_='dd')[:num_results]:
200
- try:
201
- title_elem = result.find('h3', class_='title')
202
- if not title_elem:
203
- continue
204
-
205
- link_elem = title_elem.find('a')
206
- if not link_elem:
207
- continue
208
-
209
- title = link_elem.get_text()
210
- url = link_elem.get('href', '')
211
-
212
- snippet_elem = result.find('div', class_='compText')
213
- snippet = snippet_elem.get_text() if snippet_elem else ""
214
-
215
- if title and url.startswith('http'):
216
- results.append(SearchResult(title=title, url=url, snippet=snippet))
217
- except Exception as e:
218
- continue
219
-
220
- return results
221
- except Exception as e:
222
- print(f"Yahoo search error: {e}")
223
- return []
224
-
225
- async def close(self):
226
- """Close the session safely"""
227
- if self.session and not self.session.closed:
228
- await self.session.close()
229
- # Wait a bit for the underlying connections to close
230
- await asyncio.sleep(0.1)
231
-
232
- class ContentScraper:
233
- """Scrape and parse article content using newspaper3k with robust error handling"""
234
-
235
- def __init__(self):
236
- self.session = None
237
- self.headers = {
238
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
239
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
240
- 'Accept-Language': 'en-US,en;q=0.9',
241
- 'Accept-Encoding': 'gzip, deflate, br',
242
- 'Connection': 'keep-alive',
243
- 'Upgrade-Insecure-Requests': '1',
244
- 'Sec-Fetch-Dest': 'document',
245
- 'Sec-Fetch-Mode': 'navigate',
246
- 'Sec-Fetch-Site': 'cross-site',
247
- 'Sec-Fetch-User': '?1',
248
- 'Cache-Control': 'no-cache',
249
- 'Pragma': 'no-cache'
250
- }
251
- # Domains known to block scrapers - we'll handle these differently
252
- self.blocked_domains = {
253
- 'bloomberg.com', 'wsj.com', 'ft.com', 'nytimes.com',
254
- 'washingtonpost.com', 'economist.com', 'reuters.com'
255
- }
256
-
257
- async def get_session(self):
258
- """Get or create aiohttp session with robust configuration"""
259
- if self.session is None or self.session.closed:
260
- connector = aiohttp.TCPConnector(
261
- limit=30,
262
- limit_per_host=10,
263
- ttl_dns_cache=300,
264
- use_dns_cache=True,
265
- keepalive_timeout=60,
266
- enable_cleanup_closed=True,
267
- ssl=False # Disable SSL verification for problematic sites
268
- )
269
- timeout = aiohttp.ClientTimeout(total=60, connect=20, sock_read=40)
270
- self.session = aiohttp.ClientSession(
271
- headers=self.headers,
272
- connector=connector,
273
- timeout=timeout,
274
- trust_env=True
275
- )
276
- return self.session
277
-
278
- def is_blocked_domain(self, url: str) -> bool:
279
- """Check if domain is known to block scrapers"""
280
- from urllib.parse import urlparse
281
- try:
282
- domain = urlparse(url).netloc.lower()
283
- return any(blocked in domain for blocked in self.blocked_domains)
284
- except:
285
- return False
286
-
287
- async def scrape_article_fallback(self, url: str) -> Tuple[str, Optional[str]]:
288
- """Enhanced fallback scraping method using direct HTTP request"""
289
- try:
290
- session = await self.get_session()
291
-
292
- # Add random delay to avoid rate limiting
293
- await asyncio.sleep(0.2)
294
-
295
- async with session.get(url, allow_redirects=True) as response:
296
- if response.status != 200:
297
- return "", None
298
-
299
- html = await response.text()
300
- soup = BeautifulSoup(html, 'html.parser')
301
-
302
- # Remove unwanted elements
303
- for unwanted in soup(["script", "style", "nav", "header", "footer", "aside", "iframe", "noscript"]):
304
- unwanted.decompose()
305
-
306
- # Try multiple content extraction strategies
307
- content = ""
308
-
309
- # Strategy 1: Look for common article content containers
310
- content_selectors = [
311
- # Generic selectors
312
- 'article', '[role="main"]', 'main', '.main-content', '.content',
313
- # News-specific selectors
314
- '.story-body', '.article-body', '.entry-content', '.post-content',
315
- '.article-content', '.story-content', '.news-content',
316
- # Site-specific selectors
317
- '[data-module="ArticleBody"]', '.RichTextStoryBody', '.InlineVideo',
318
- '.zone-content', '.field-name-body', '.story-text',
319
- # CNN specific
320
- '.zn-body__paragraph', '.zn-body-text',
321
- # Fox News specific
322
- '.article-body', '.article-text',
323
- # NBC specific
324
- '.articleText', '.inline-story-content',
325
- # AP News specific
326
- '.Article', '.RichTextStoryBody',
327
- # BBC specific
328
- '[data-component="text-block"]', '.ssrcss-1q0x1qg-Paragraph',
329
- # Generic fallbacks
330
- '.text', '.body', '[class*="content"]', '[class*="article"]', '[class*="story"]'
331
- ]
332
-
333
- for selector in content_selectors:
334
- try:
335
- elements = soup.select(selector)
336
- if elements:
337
- texts = []
338
- for elem in elements:
339
- text = elem.get_text(separator=' ', strip=True)
340
- if len(text) > 50: # Only meaningful content
341
- texts.append(text)
342
-
343
- if texts:
344
- content = ' '.join(texts)
345
- if len(content) > 200: # Good content found
346
- break
347
- except:
348
- continue
349
-
350
- # Strategy 2: If no structured content, get all paragraphs
351
- if not content or len(content) < 100:
352
- paragraphs = soup.find_all('p')
353
- p_texts = []
354
- for p in paragraphs:
355
- text = p.get_text(strip=True)
356
- # Filter out short paragraphs, likely navigation/ads
357
- if len(text) > 30 and not any(skip in text.lower() for skip in
358
- ['cookie', 'advertisement', 'subscribe', 'newsletter',
359
- 'follow us', 'social media', 'share this']):
360
- p_texts.append(text)
361
-
362
- if p_texts:
363
- content = ' '.join(p_texts)
364
-
365
- # Strategy 3: Extract from divs with text content
366
- if not content or len(content) < 100:
367
- divs = soup.find_all('div')
368
- div_texts = []
369
- for div in divs:
370
- # Only direct text, not nested
371
- text = div.get_text(separator=' ', strip=True)
372
- if 100 < len(text) < 1000: # Reasonable paragraph length
373
- # Check if it's likely article content
374
- if any(word in text.lower() for word in ['said', 'according', 'reported', 'stated', 'announced']):
375
- div_texts.append(text)
376
-
377
- if div_texts:
378
- content = ' '.join(div_texts[:3]) # Take first 3 relevant divs
379
-
380
- # Try to extract publication date
381
- pub_date = None
382
- date_selectors = [
383
- 'time[datetime]', '[datetime]',
384
- '.published-date', '.post-date', '.article-date',
385
- '.timestamp', '.date', '.publish-date',
386
- '[data-testid="timestamp"]', '.byline-timestamp',
387
- '.story-date', '.news-date'
388
- ]
389
-
390
- for selector in date_selectors:
391
- try:
392
- date_elem = soup.select_one(selector)
393
- if date_elem:
394
- pub_date = (date_elem.get('datetime') or
395
- date_elem.get('content') or
396
- date_elem.get_text(strip=True))
397
- if pub_date:
398
- break
399
- except:
400
- continue
401
-
402
- # Don't limit content length here - let LLM handle full content
403
- if content:
404
- # Remove excessive whitespace
405
- content = ' '.join(content.split())
406
-
407
- return content, pub_date
408
-
409
- except Exception as e:
410
- print(f"Enhanced fallback scraping failed for {url}: {str(e)[:100]}...")
411
- return "", None
412
-
413
- async def scrape_article(self, url: str) -> Tuple[str, Optional[str]]:
414
- """Scrape article content with multiple fallback strategies"""
415
- content = ""
416
- pub_date = None
417
-
418
- # Method 1: Try newspaper3k first (simple approach)
419
- try:
420
- article = Article(url)
421
- article.download()
422
- article.parse()
423
-
424
- if article.text and len(article.text.strip()) > 100:
425
- content = article.text.strip() # Don't limit content length
426
- pub_date = article.publish_date.isoformat() if article.publish_date else None
427
- return content, pub_date
428
-
429
- except Exception as e:
430
- print(f"Newspaper3k failed for {url}: {str(e)[:100]}...")
431
-
432
- # Method 2: Fallback to direct HTTP scraping
433
- try:
434
- content, pub_date = await self.scrape_article_fallback(url)
435
- if content and len(content.strip()) > 50:
436
- return content, pub_date
437
- except Exception as e:
438
- print(f"Fallback scraping failed for {url}: {str(e)[:100]}...")
439
-
440
- # Method 3: Last resort - try to get at least the title/snippet
441
- try:
442
- session = await self.get_session()
443
- async with session.get(url, allow_redirects=True) as response:
444
- if response.status == 200:
445
- html = await response.text()
446
- soup = BeautifulSoup(html, 'html.parser')
447
-
448
- # Get at least the title and meta description
449
- title = soup.find('title')
450
- title_text = title.get_text().strip() if title else ""
451
-
452
- meta_desc = soup.find('meta', attrs={'name': 'description'})
453
- desc_text = meta_desc.get('content', '').strip() if meta_desc else ""
454
-
455
- if title_text or desc_text:
456
- content = f"{title_text}. {desc_text}".strip()
457
- return content, None
458
-
459
- except Exception as e:
460
- print(f"Last resort scraping failed for {url}: {str(e)[:100]}...")
461
-
462
- return "", None
463
-
464
- async def scrape_multiple(self, search_results: List[SearchResult], max_successful: int = None) -> List[SearchResult]:
465
- """Scrape multiple articles with robust error handling and retry logic"""
466
- if not search_results:
467
- return search_results
468
-
469
- max_successful = max_successful or len(search_results)
470
- successful_scraped = 0
471
- semaphore = asyncio.Semaphore(5) # Limit concurrent requests
472
-
473
- async def scrape_with_semaphore(result: SearchResult) -> SearchResult:
474
- nonlocal successful_scraped
475
-
476
- if successful_scraped >= max_successful:
477
- return result
478
-
479
- async with semaphore:
480
- try:
481
- # Skip if already have enough successful results
482
- if successful_scraped >= max_successful:
483
- return result
484
-
485
- content, pub_date = await self.scrape_article(result.url)
486
-
487
- if content and len(content.strip()) > 50:
488
- result.content = content
489
- result.publication_date = pub_date
490
- successful_scraped += 1
491
- print(f"✅ Successfully scraped: {result.url[:60]}...")
492
- else:
493
- print(f"⚠️ No content extracted from: {result.url[:60]}...")
494
-
495
- except Exception as e:
496
- print(f"❌ Failed to scrape {result.url[:60]}...: {e}")
497
-
498
- return result
499
-
500
- # Process all URLs but stop when we have enough successful results
501
- tasks = []
502
- for result in search_results:
503
- if successful_scraped < max_successful:
504
- tasks.append(scrape_with_semaphore(result))
505
- else:
506
- break
507
-
508
- if tasks:
509
- scraped_results = await asyncio.gather(*tasks, return_exceptions=True)
510
-
511
- # Filter out exceptions and return successful results
512
- valid_results = []
513
- for result in scraped_results:
514
- if not isinstance(result, Exception):
515
- valid_results.append(result)
516
- else:
517
- valid_results = search_results
518
-
519
- # Return results with content first, then others
520
- results_with_content = [r for r in valid_results if r.content.strip()]
521
- results_without_content = [r for r in valid_results if not r.content.strip()]
522
-
523
- print(f"📊 Scraping summary: {len(results_with_content)} successful, {len(results_without_content)} failed")
524
-
525
- return results_with_content + results_without_content
526
-
527
- async def close(self):
528
- """Close the session"""
529
- if self.session:
530
- await self.session.close()
531
-
532
- class EmbeddingFilter:
533
- """Filter search results using embedding-based similarity"""
534
-
535
- def __init__(self):
536
- self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
537
-
538
- def filter_by_relevance(self, query: str, search_results: List[SearchResult],
539
- threshold: float = 0.1) -> List[SearchResult]:
540
- """Filter results by cosine similarity with query"""
541
- if not search_results:
542
- return search_results
543
-
544
- # Combine title, snippet, and content for each result
545
- result_texts = []
546
- for result in search_results:
547
- combined_text = f"{result.title} {result.snippet} {result.content[:1000]}"
548
- result_texts.append(combined_text)
549
-
550
- if not result_texts:
551
- return search_results
552
-
553
- try:
554
- # Add query to the corpus for vectorization
555
- all_texts = [query] + result_texts
556
-
557
- # Vectorize texts
558
- tfidf_matrix = self.vectorizer.fit_transform(all_texts)
559
-
560
- # Calculate cosine similarity between query and each result
561
- query_vector = tfidf_matrix[0:1]
562
- result_vectors = tfidf_matrix[1:]
563
-
564
- similarities = cosine_similarity(query_vector, result_vectors)[0]
565
-
566
- # Add relevance scores and filter
567
- filtered_results = []
568
- for i, result in enumerate(search_results):
569
- result.relevance_score = similarities[i]
570
- if similarities[i] >= threshold:
571
- filtered_results.append(result)
572
-
573
- # Sort by relevance score
574
- filtered_results.sort(key=lambda x: x.relevance_score, reverse=True)
575
- return filtered_results
576
-
577
- except Exception as e:
578
- print(f"Embedding filter error: {e}")
579
- return search_results
580
-
581
- class LLMSummarizer:
582
- """Improved summarizer without content validation filtering - sends all scraped content to LLM"""
583
-
584
- def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
585
- self.groq_api_key = groq_api_key
586
- self.openrouter_api_key = openrouter_api_key
587
- self.groq_model = "meta-llama/llama-4-maverick-17b-128e-instruct"
588
- self.openrouter_model = "deepseek/deepseek-r1:free"
589
-
590
- def create_system_prompt(self) -> str:
591
- """Create system prompt for summarization"""
592
- return """You are an expert research assistant. Your task is to analyze search results and provide a comprehensive, accurate summary that directly answers the user's query.
593
-
594
- CRITICAL INSTRUCTIONS:
595
- 1. Analyze ALL provided content carefully and thoroughly
596
- 2. Extract and synthesize any information relevant to answering the user's question
597
- 3. Include specific facts, dates, numbers, and quotes when present
598
- 4. If information is contradictory between sources, mention this
599
- 5. Cite sources by mentioning the publication or website name
600
- 6. Be thorough and detailed in your analysis
601
- 7. If some content seems tangentially related, still include relevant portions
602
- 8. Focus on directly answering the user's query with the most relevant information first
603
-
604
- Format your response as a comprehensive summary, not bullet points. Provide a thorough analysis of all the content provided."""
605
-
606
- def prepare_content_for_llm(self, query: str, search_results: List[SearchResult]) -> str:
607
- """Prepare content for LLM without validation filtering - include ALL scraped content"""
608
-
609
- # No content validation - include all results that have any content
610
- valid_results = [result for result in search_results if result.content.strip()]
611
-
612
- if not valid_results:
613
- return f"""Query: "{query}"
614
-
615
- No content was successfully scraped from the search results. This might be due to anti-bot protections or network issues."""
616
-
617
- content_parts = [f'User Query: "{query}"\n']
618
- content_parts.append(f"Number of sources with content: {len(valid_results)}\n")
619
-
620
- for i, result in enumerate(valid_results, 1):
621
- content_parts.append(f"=== SOURCE {i} ===")
622
- content_parts.append(f"Title: {result.title}")
623
- content_parts.append(f"URL: {result.url}")
624
-
625
- if result.publication_date:
626
- content_parts.append(f"Date: {result.publication_date}")
627
-
628
- if result.relevance_score > 0:
629
- content_parts.append(f"Relevance Score: {result.relevance_score:.3f}")
630
-
631
- # Include snippet if it's different from content start
632
- if result.snippet and not result.content.startswith(result.snippet[:50]):
633
- content_parts.append(f"Snippet: {result.snippet}")
634
-
635
- # Include FULL content without truncation - let the LLM handle the large context
636
- content = result.content.strip()
637
- content_parts.append(f"Content: {content}")
638
- content_parts.append("") # Empty line between sources
639
-
640
- return "\n".join(content_parts)
641
-
642
- async def summarize_with_groq(self, query: str, search_results: List[SearchResult],
643
- temperature: float = 0.3, max_tokens: int = 8000) -> str:
644
- """Enhanced Groq summarization with increased token limits and no content filtering"""
645
- if not self.groq_api_key:
646
- return "Groq API key not provided"
647
-
648
- try:
649
- # Prepare content without validation filtering
650
- prepared_content = self.prepare_content_for_llm(query, search_results)
651
-
652
- # Debug output
653
- print(f"DEBUG - Sending {len(prepared_content)} characters to Groq AI")
654
- print(f"DEBUG - Results with content: {len([r for r in search_results if r.content])}")
655
- print(f"DEBUG - Max completion tokens: {max_tokens}")
656
-
657
- user_prompt = f"""Please analyze the following search results and provide a comprehensive summary that directly answers the user's query.
658
-
659
- {prepared_content}
660
-
661
- Instructions:
662
- - Focus on information relevant to the query: "{query}"
663
- - Analyze ALL provided content thoroughly
664
- - Be specific and factual, include dates/numbers when available
665
- - Mention source publications when referencing information
666
- - If results contain limited relevant information, state this clearly but still extract what you can
667
- - Provide a comprehensive analysis of all available content"""
668
-
669
- headers = {
670
- "Authorization": f"Bearer {self.openrouter_api_key}",
671
- "Content-Type": "application/json",
672
- "HTTP-Referer": "https://huggingface.co/spaces",
673
- "X-Title": "AI Search Engine"
674
- }
675
-
676
- payload = {
677
- "model": self.openrouter_model,
678
- "messages": [
679
- {"role": "system", "content": self.create_system_prompt()},
680
- {"role": "user", "content": user_prompt}
681
- ],
682
- "temperature": temperature,
683
- "max_tokens": max_tokens
684
- }
685
-
686
- async with aiohttp.ClientSession() as session:
687
- async with session.post("https://openrouter.ai/api/v1/chat/completions",
688
- headers=headers, json=payload) as response:
689
- if response.status == 200:
690
- result = await response.json()
691
- summary = result["choices"][0]["message"]["content"]
692
-
693
- # Add debug info
694
- debug_info = f"\n\n[Content Sources: {len([r for r in search_results if r.content])} with content, {len(search_results)} total]"
695
- return summary + debug_info
696
-
697
- else:
698
- error_text = await response.text()
699
- return f"OpenRouter API error: {response.status} - {error_text}"
700
-
701
- except Exception as e:
702
- return f"Error with OpenRouter summarization: {str(e)}"
703
-
704
- class AISearchEngine:
705
- """Main AI-powered search engine class"""
706
-
707
- def __init__(self, groq_api_key: str = "", openrouter_api_key: str = ""):
708
- self.query_enhancer = QueryEnhancer()
709
- self.search_interface = SearchEngineInterface()
710
- self.content_scraper = ContentScraper()
711
- self.embedding_filter = EmbeddingFilter()
712
- self.llm_summarizer = LLMSummarizer(groq_api_key, openrouter_api_key)
713
-
714
- async def search_and_summarize(self,
715
- query: str,
716
- search_engines: List[str],
717
- model: str,
718
- use_embeddings: bool,
719
- temperature: float,
720
- max_results: int,
721
- max_tokens: int) -> Tuple[str, str]:
722
- """Main search and summarization pipeline with robust error handling"""
723
-
724
- start_time = time.time()
725
- status_updates = []
726
-
727
- try:
728
- # Step 1: Query Enhancement
729
- status_updates.append("🔍 Enhancing search query...")
730
- enhanced_query = self.query_enhancer.enhance_query(query)
731
- status_updates.append(f"Enhanced query: {enhanced_query}")
732
-
733
- # Step 2: Parallel Search across engines
734
- status_updates.append("🌐 Searching across multiple engines...")
735
- search_tasks = []
736
-
737
- if "Google" in search_engines:
738
- search_tasks.append(self.search_interface.search_google(enhanced_query, max_results))
739
- if "Bing" in search_engines:
740
- search_tasks.append(self.search_interface.search_bing(enhanced_query, max_results))
741
- if "Yahoo" in search_engines:
742
- search_tasks.append(self.search_interface.search_yahoo(enhanced_query, max_results))
743
-
744
- if not search_tasks:
745
- return "No search engines selected", "\n".join(status_updates)
746
-
747
- search_results_lists = await asyncio.gather(*search_tasks, return_exceptions=True)
748
-
749
- # Combine and deduplicate results, handling exceptions
750
- all_results = []
751
- seen_urls = set()
752
-
753
- for results_list in search_results_lists:
754
- if not isinstance(results_list, Exception) and results_list:
755
- for result in results_list:
756
- if result.url not in seen_urls and result.url.startswith('http'):
757
- all_results.append(result)
758
- seen_urls.add(result.url)
759
-
760
- status_updates.append(f"Found {len(all_results)} unique results")
761
-
762
- if not all_results:
763
- return "No search results found. This might be due to rate limiting or network issues. Please try again.", "\n".join(status_updates)
764
-
765
- # Step 3: Content Scraping with intelligent retry and fallback
766
- status_updates.append("📄 Scraping article content...")
767
-
768
- # Prioritize results and scrape intelligently
769
- target_successful = min(max_results, len(all_results))
770
- scraped_results = await self.content_scraper.scrape_multiple(
771
- all_results[:max_results * 2], # Try more URLs to ensure we get enough content
772
- max_successful=target_successful
773
- )
774
-
775
- # Include ALL results with any content (no filtering)
776
- results_with_content = [r for r in scraped_results if r.content.strip()]
777
- status_updates.append(f"Successfully scraped {len(results_with_content)} articles with content")
778
-
779
- # Debug: Show what content we actually got
780
- for i, result in enumerate(results_with_content[:3]):
781
- print(f"Result {i+1}: {result.title}")
782
- print(f"Content length: {len(result.content)}")
783
- print(f"Content preview: {result.content[:200]}...")
784
- print("---")
785
-
786
- # If we don't have enough content, try to get some from snippets
787
- if len(results_with_content) < 3:
788
- status_updates.append("Using search snippets as fallback content...")
789
- for result in scraped_results:
790
- if not result.content.strip() and result.snippet.strip():
791
- result.content = result.snippet
792
- results_with_content.append(result)
793
- if len(results_with_content) >= 5: # Reasonable minimum
794
- break
795
-
796
- if not results_with_content:
797
- return "No article content could be extracted. This might be due to anti-bot protections. Please try a different query or try again later.", "\n".join(status_updates)
798
-
799
- # Step 4: Optional Embedding-based Filtering
800
- if use_embeddings and results_with_content:
801
- status_updates.append("🧠 Filtering results using embeddings...")
802
- try:
803
- filtered_results = self.embedding_filter.filter_by_relevance(query, results_with_content)
804
- if filtered_results:
805
- results_with_content = filtered_results
806
- status_updates.append(f"Filtered to {len(filtered_results)} most relevant results")
807
- else:
808
- status_updates.append("Embedding filter returned no results, using all scraped content")
809
- except Exception as e:
810
- status_updates.append(f"Embedding filtering failed, using all results: {str(e)}")
811
-
812
- if not results_with_content:
813
- return "No relevant results found after filtering", "\n".join(status_updates)
814
-
815
- # Step 5: LLM Summarization - now sends ALL content without validation filtering
816
- status_updates.append(f"🤖 Generating summary using {model} (processing all scraped content)...")
817
-
818
- try:
819
- if model.startswith("Groq"):
820
- summary = await self.llm_summarizer.summarize_with_groq(
821
- query, results_with_content, temperature, max_tokens
822
- )
823
- else: # OpenRouter
824
- summary = await self.llm_summarizer.summarize_with_openrouter(
825
- query, results_with_content, temperature, max_tokens
826
- )
827
-
828
- # Check if summarization failed
829
- if summary.startswith("Error") or summary.startswith("Groq API error") or summary.startswith("OpenRouter API error"):
830
- # Provide a basic summary from the content
831
- basic_summary = self.create_basic_summary(query, results_with_content)
832
- summary = f"AI summarization failed, but here's what I found:\n\n{basic_summary}\n\n---\n⚠️ Original error: {summary}"
833
-
834
- except Exception as e:
835
- # Fallback to basic summary
836
- basic_summary = self.create_basic_summary(query, results_with_content)
837
- summary = f"AI summarization encountered an error, but here's what I found:\n\n{basic_summary}\n\n---\n⚠️ Error: {str(e)}"
838
-
839
- # Add metadata
840
- end_time = time.time()
841
- processing_time = end_time - start_time
842
-
843
- metadata = f"\n\n---\n**Search Metadata:**\n"
844
- metadata += f"- Processing time: {processing_time:.2f} seconds\n"
845
- metadata += f"- Results found: {len(all_results)}\n"
846
- metadata += f"- Articles scraped: {len(results_with_content)}\n"
847
- metadata += f"- Search engines: {', '.join(search_engines)}\n"
848
- metadata += f"- Model: {model}\n"
849
- metadata += f"- Embeddings used: {use_embeddings}\n"
850
- metadata += f"- Content filtering: DISABLED (all content sent to LLM)\n"
851
-
852
- final_summary = summary + metadata
853
- status_updates.append(f"✅ Summary generated in {processing_time:.2f}s")
854
-
855
- return final_summary, "\n".join(status_updates)
856
-
857
- except Exception as e:
858
- error_msg = f"Error in search pipeline: {str(e)}"
859
- status_updates.append(f"❌ {error_msg}")
860
- return error_msg, "\n".join(status_updates)
861
-
862
- finally:
863
- # Cleanup - but don't close sessions immediately to allow reuse
864
- try:
865
- # Don't close sessions here as they might be reused
866
- pass
867
- except Exception as e:
868
- print(f"Cleanup error: {e}")
869
-
870
- def create_basic_summary(self, query: str, results: List[SearchResult]) -> str:
871
- """Create a basic summary when AI summarization fails"""
872
- summary_parts = [f"Based on search results for: **{query}**\n"]
873
-
874
- for i, result in enumerate(results[:5], 1):
875
- content_preview = result.content[:300] + "..." if len(result.content) > 300 else result.content
876
- summary_parts.append(f"**{i}. {result.title}**")
877
- summary_parts.append(f"Source: {result.url}")
878
- if result.publication_date:
879
- summary_parts.append(f"Date: {result.publication_date}")
880
- summary_parts.append(f"Content: {content_preview}")
881
- summary_parts.append("")
882
-
883
- return "\n".join(summary_parts)
884
-
885
- # Global search engine instance
886
- search_engine = None
887
-
888
- async def initialize_search_engine(groq_key: str, openrouter_key: str):
889
- """Initialize the search engine with API keys"""
890
- global search_engine
891
- search_engine = AISearchEngine(groq_key, openrouter_key)
892
- return search_engine
893
-
894
- async def perform_search(query: str,
895
- search_engines: List[str],
896
- model: str,
897
- use_embeddings: bool,
898
- temperature: float,
899
- max_results: int,
900
- max_tokens: int,
901
- groq_key: str,
902
- openrouter_key: str):
903
- """Perform search with given parameters"""
904
- global search_engine
905
-
906
- if search_engine is None:
907
- search_engine = await initialize_search_engine(groq_key, openrouter_key)
908
-
909
- return await search_engine.search_and_summarize(
910
- query, search_engines, model, use_embeddings,
911
- temperature, max_results, max_tokens
912
- )
913
-
914
- async def chat_inference(message, history, groq_key, openrouter_key, model_choice, search_engines, use_embeddings, temperature, max_results, max_tokens):
915
- """Main chat inference function for ChatInterface with additional inputs"""
916
  try:
917
- if not message.strip():
918
- yield "Please enter a search query."
919
- return
920
-
921
- if not groq_key and not openrouter_key:
922
- yield "❌ Please provide at least one API key (Groq or OpenRouter) to use the AI summarization features."
923
- return
924
-
925
- if not search_engines:
926
- yield "❌ Please select at least one search engine."
927
- return
928
-
929
- # Initialize search engine
930
- global search_engine
931
- if search_engine is None:
932
- search_engine = await initialize_search_engine(groq_key, openrouter_key)
933
- else:
934
- # Update API keys if they changed
935
- search_engine.llm_summarizer.groq_api_key = groq_key
936
- search_engine.llm_summarizer.openrouter_api_key = openrouter_key
937
-
938
- # Start with status updates
939
- yield "🔍 Enhancing query and searching across multiple engines..."
940
-
941
- # Small delay to show the initial status
942
- await asyncio.sleep(0.1)
943
-
944
- # Update status
945
- yield "🌐 Fetching results from search engines..."
946
- await asyncio.sleep(0.1)
947
-
948
- # Update status
949
- yield "📄 Scraping article content..."
950
- await asyncio.sleep(0.1)
951
-
952
- if use_embeddings:
953
- yield "🧠 Filtering results using embeddings..."
954
- await asyncio.sleep(0.1)
955
-
956
- yield "🤖 Generating AI-powered summary (processing all scraped content)..."
957
- await asyncio.sleep(0.1)
958
-
959
- # Perform the actual search and summarization
960
- summary, status = await search_engine.search_and_summarize(
961
- message,
962
- search_engines,
963
- model_choice,
964
- use_embeddings,
965
- temperature,
966
- max_results,
967
- max_tokens
968
  )
969
-
970
- # Stream the final result
971
- yield summary
972
-
973
  except Exception as e:
974
- yield f"❌ Search failed: {str(e)}\n\nPlease check your API keys and try again."
975
-
976
- def create_gradio_interface():
977
- """Create the modern Gradio ChatInterface"""
978
-
979
- # Define additional inputs for the accordion
980
- additional_inputs = [
981
- gr.Textbox(
982
- label="🔑 Groq API Key",
983
- type="password",
984
- placeholder="Enter your Groq API key (get from: https://console.groq.com/)",
985
- info="Required for Groq Llama-4 model"
986
- ),
987
- gr.Textbox(
988
- label="🔑 OpenRouter API Key",
989
- type="password",
990
- placeholder="Enter your OpenRouter API key (get from: https://openrouter.ai/)",
991
- info="Required for OpenRouter DeepSeek-R1 model"
992
- ),
993
- gr.Dropdown(
994
- choices=["Groq (Llama-4)", "OpenRouter (DeepSeek-R1)"],
995
- value="Groq (Llama-4)",
996
- label="🤖 AI Model",
997
- info="Choose the AI model for summarization"
998
- ),
999
- gr.CheckboxGroup(
1000
- choices=["Google", "Bing", "Yahoo"],
1001
- value=["Google", "Bing"],
1002
- label="🔍 Search Engines",
1003
- info="Select which search engines to use (multiple recommended)"
1004
- ),
1005
- gr.Checkbox(
1006
- value=True,
1007
- label="🧠 Use Embedding-based Filtering",
1008
- info="Filter results by relevance using TF-IDF similarity (recommended)"
1009
- ),
1010
- gr.Slider(
1011
- minimum=0.0,
1012
- maximum=1.0,
1013
- value=0.3,
1014
- step=0.1,
1015
- label="🌡️ Temperature",
1016
- info="Higher = more creative, Lower = more focused (0.1-0.3 recommended for factual queries)"
1017
- ),
1018
- gr.Slider(
1019
- minimum=5,
1020
- maximum=20,
1021
- value=10,
1022
- step=1,
1023
- label="📊 Max Results per Engine",
1024
- info="Number of search results to fetch from each engine"
1025
- ),
1026
- gr.Slider(
1027
- minimum=1000,
1028
- maximum=8000,
1029
- value=8000,
1030
- step=500,
1031
- label="📝 Max Completion Tokens",
1032
- info="Maximum length of the AI-generated summary (Groq: up to 8000, OpenRouter: up to 4000)"
1033
- )
1034
- ]
1035
-
1036
- # Create the main ChatInterface
1037
- chat_interface = gr.ChatInterface(
1038
- fn=chat_inference,
1039
- additional_inputs=additional_inputs,
1040
- additional_inputs_accordion=gr.Accordion("⚙️ Configuration & Advanced Parameters", open=True),
1041
- title="🔍 AI-Powered Search Engine - No Content Filtering",
1042
- description="""
1043
- **Search across Google, Bing, and Yahoo, then get AI-powered summaries!**
1044
-
1045
- ✨ **Features:** Multi-engine search • Query enhancement • Parallel scraping • AI summarization • Embedding filtering
1046
- 🚀 **Updated:** All scraped content is now sent to the LLM without filtering • Increased Groq token limits (up to 8K)
1047
-
1048
- 📋 **Quick Start:** 1) Add your API key below 2) Select search engines 3) Ask any question!
1049
- """,
1050
- cache_examples=False,
1051
- submit_btn="🔍 Search & Summarize",
1052
- stop_btn="⏹️ Stop",
1053
- chatbot=gr.Chatbot(
1054
- show_copy_button=True,
1055
- layout="bubble",
1056
- height=600,
1057
- placeholder="🚀 Ready to search! All scraped content will be sent to the LLM for comprehensive analysis.",
1058
- show_share_button=True
1059
- ),
1060
- theme=gr.themes.Soft(),
1061
- analytics_enabled=False,
1062
- type="messages" # Use the modern message format
1063
- )
1064
-
1065
- return chat_interface
1066
 
1067
  if __name__ == "__main__":
1068
- demo = create_gradio_interface()
1069
- demo.launch(share=True)
 
+ import os
  import gradio as gr
+ from groq import Groq

+ # Set up Groq client
+ client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

+ # Function to handle user input
+ def chat_inference(message, history):
      try:
+         # Call compound-beta model
+         response = client.chat.completions.create(
+             messages=[{"role": "user", "content": message}],
+             model="compound-beta"
          )
+         reply = response.choices[0].message.content
      except Exception as e:
+         reply = f"⚠️ Error: {str(e)}"
+     return reply
+
+ # Optional configuration inputs (can be expanded)
+ additional_inputs = [
+     gr.Textbox(label="🔍 Example Prompt", value="What were the main highlights from the latest Apple keynote?")
+ ]
+
+ # Gradio ChatInterface
+ chat_interface = gr.ChatInterface(
+     fn=chat_inference,
+     additional_inputs=additional_inputs,
+     additional_inputs_accordion=gr.Accordion("⚙️ Configuration & Advanced Parameters", open=True),
+     title="🔍 AI-Powered Real-Time Search with Groq",
+     description="Ask anything that requires real-time info — powered by Groq’s blazing fast `compound-beta` model with built-in web search.",
+     theme="default",
+ )

  if __name__ == "__main__":
+     chat_interface.launch()
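
Note on the new handler: `chat_inference(message, history)` ignores the chat history, and because `gr.ChatInterface` passes the value of each `additional_inputs` component to `fn` as an extra argument, the single "Example Prompt" textbox implies a third parameter that the committed function does not declare. The following is a minimal, hypothetical sketch (not part of this commit) of a variant that accepts that extra value and forwards prior turns to the Groq call; it assumes history arrives in Gradio's messages format, i.e. a list of `{"role": ..., "content": ...}` dicts.

```python
# Hypothetical sketch (not part of this commit): a chat_inference variant that
# accepts the extra additional_inputs value and forwards prior turns to Groq.
import os

from groq import Groq

client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def chat_inference_with_history(message, history, example_prompt=None):
    # "example_prompt" receives the Example Prompt textbox value that
    # gr.ChatInterface appends after (message, history); it is unused here.
    # Assumes history is a list of {"role": ..., "content": ...} dicts
    # (Gradio's messages format); tuple-style histories would need conversion.
    messages = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
        if isinstance(turn, dict) and turn.get("role") in ("user", "assistant")
    ]
    messages.append({"role": "user", "content": message})
    try:
        response = client.chat.completions.create(
            messages=messages,
            model="compound-beta",
        )
        reply = response.choices[0].message.content
    except Exception as e:
        reply = f"⚠️ Error: {str(e)}"
    return reply
```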