import os
import time
import logging
import random
import platform
from typing import Dict, List, Optional, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from utils.web_scraper import get_website_text_content
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
class ResearchEngine:
    """Enhanced research engine with improved scraping and error handling.

    Aggregates Google Custom Search results with optional NewsAPI articles,
    then scrapes the top unique URLs in parallel through a shared, retrying
    ``requests.Session``.
    """

    # Upper bound on concurrent scraper threads AND on the HTTP connection
    # pool size (the two are kept equal so workers never block on the pool).
    _MAX_SCRAPERS: int = 5

    def __init__(self):
        """Read API credentials from the environment and build the HTTP stack.

        Raises:
            EnvironmentError: if GOOGLE_API_KEY or GOOGLE_CX is missing/blank.
            Exception: re-raised from the Google API client on init failure.
        """
        # Secrets set via dashboards/CI sometimes carry a literal "\n" or a
        # real trailing newline; scrub both so the API client gets clean keys.
        self.google_api_key = os.getenv("GOOGLE_API_KEY", "").strip().replace("\\n", "").replace("\n", "")
        self.google_cx = os.getenv("GOOGLE_CX", "").strip().replace("\\n", "").replace("\n", "")
        if not (self.google_api_key and self.google_cx):
            raise EnvironmentError("GOOGLE_API_KEY and GOOGLE_CX must be set (no trailing newlines)")
        try:
            self.google_service = build("customsearch", "v1", developerKey=self.google_api_key)
            logging.info("Google Search service initialized successfully")
        except Exception as e:
            logging.error("Error initializing Google Custom Search: %s", e)
            raise
        # One shared session: bounded connection pool plus automatic retries
        # on transient HTTP failures (including 429 rate limiting, honoring
        # any Retry-After header the server sends).
        self.session = requests.Session()
        retries = Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[413, 429, 500, 502, 503, 504],
            respect_retry_after_header=True
        )
        adapter = HTTPAdapter(pool_connections=self._MAX_SCRAPERS,
                              pool_maxsize=self._MAX_SCRAPERS,
                              max_retries=retries, pool_block=True)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Rotated per scrape request to reduce trivial bot blocking.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0"
        ]

    def search_multiple_sources(self, query: str, context: str) -> Dict[str, Any]:
        """Aggregate Google results, news articles, and scraped page content.

        Args:
            query: free-text search query.
            context: extra terms appended to the Google query.

        Returns:
            Dict with keys 'google_results', 'news_results',
            'scraped_content', 'sources' (ordered URL list, pre-dedup) and
            'metadata' (timestamp and counts).

        Raises:
            RuntimeError: if the Google Custom Search call fails. News and
                scraping failures are logged and tolerated (best-effort).
        """
        now = time.time()
        results = {
            'google_results': [],
            'news_results': [],
            'scraped_content': [],
            'sources': [],
            'metadata': {}
        }
        try:
            data = self._search_google(query, context)
            results['google_results'] = data['items']
            results['sources'].extend(data['sources'])
            logging.info("Google search gave %d results", len(data['items']))
        except Exception as e:
            logging.error("Google Search API error: %s", e)
            # Chain the original exception so callers can see the root cause.
            raise RuntimeError("Google Search API failure") from e
        news = self._search_news_api(query)
        if news:
            # BUG FIX: use .get() here as well — the original indexed
            # news['articles'] directly and raised KeyError on a NewsAPI
            # response missing that key.
            articles = news.get('articles', [])
            results['news_results'] = articles
            results['sources'].extend(a['url'] for a in articles if a.get('url'))
            logging.info("Added %d news sources", len(results['news_results']))
        # De-duplicate while preserving order, then scrape only the top few.
        unique_urls = list(dict.fromkeys(results['sources']))
        results['scraped_content'] = self._parallel_scrape(unique_urls[:self._MAX_SCRAPERS])
        results['metadata'] = {
            'search_timestamp': now,
            'total_sources': len(results['sources']),
            'scraped_count': len(results['scraped_content'])
        }
        return results

    def _search_google(self, query: str, context: str) -> Dict[str, Any]:
        """Run one Custom Search query (query + context, up to 10 hits).

        Returns:
            {'items': [{'title', 'snippet', 'link', 'displayLink'}, ...],
             'sources': [link, ...]} — entries without a link are dropped.
        """
        professional_query = f"{query} {context}"
        resp = self.google_service.cse().list(q=professional_query, cx=self.google_cx, num=10).execute()
        items = []
        sources = []
        for x in resp.get("items", []):
            link = x.get("link")
            if link:
                items.append({
                    'title': x.get("title", ""),
                    'snippet': x.get("snippet", ""),
                    'link': link,
                    'displayLink': x.get("displayLink", "")
                })
                sources.append(link)
        return {'items': items, 'sources': sources}

    def _search_news_api(self, query: str) -> Optional[Dict[str, Any]]:
        """Query NewsAPI for recent English articles (last 30 days).

        Returns:
            The parsed JSON response, or None when NEWS_API_KEY is unset or
            the request fails (best-effort: errors are logged, not raised).
        """
        api_key = os.getenv("NEWS_API_KEY")
        if not api_key:
            logging.warning("No NEWS_API_KEY – skipping News search")
            return None
        try:
            url = "https://newsapi.org/v2/everything"
            params = {
                'q': query,
                'apiKey': api_key,
                'sortBy': 'relevancy',
                'pageSize': 20,
                'language': 'en',
                # Only articles from the last 30 days (UTC).
                'from': time.strftime('%Y-%m-%d', time.gmtime(time.time() - 30 * 24 * 3600))
            }
            r = self.session.get(url, params=params, timeout=(3, 10))
            r.raise_for_status()
            return r.json()
        except Exception as e:
            logging.error("News API error: %s", e)
            return None

    def _parallel_scrape(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Fetch *urls* concurrently; return payloads for successful pages.

        Each payload is {'url', 'content' (first 2000 chars), 'timestamp'}.
        Failures (timeouts, 403 blocks, pages under 100 chars) are logged
        and skipped; result order follows completion order, not input order.
        """
        def crawl(u: str) -> Optional[Dict[str, Any]]:
            headers = {
                "User-Agent": random.choice(self.user_agents),
                "Accept-Language": "en-US,en;q=0.9",
                "Accept": "text/html,application/xhtml+xml;q=0.9,image/webp,*/*;q=0.8",
                "Connection": "keep-alive"
            }
            try:
                resp = self.session.get(u, headers=headers, timeout=(3, 10))
                if resp.status_code == 403:
                    logging.warning("Scraping blocked (403) for %s", u)
                    return None
                resp.raise_for_status()
                # BUG FIX: resp.encoding is None when the server sends no
                # charset header; the original called .lower() on it and
                # crashed with AttributeError.
                enc = (resp.encoding or "").lower()
                if enc in ("iso-8859-1", "latin-1"):
                    # Servers often default to latin-1; trust content
                    # sniffing for a better guess instead.
                    resp.encoding = resp.apparent_encoding
                text = resp.text or ""
                if len(text) < 100:
                    return None  # too short to be useful content
                return {'url': u, 'content': text[:2000], 'timestamp': time.time()}
            except requests.exceptions.ReadTimeout:
                logging.warning("Timeout scraping %s", u)
            except Exception as e:
                # NOTE: the original also caught UnicodeEncodeError and read
                # `resp` there — dead code (resp.text cannot raise it) that
                # produced NameError when the GET itself failed; removed.
                logging.warning("Scraping failure for %s: %s", u, e)
            return None

        out: List[Dict[str, Any]] = []
        with ThreadPoolExecutor(max_workers=self._MAX_SCRAPERS) as ex:
            future_to_url = {ex.submit(crawl, u): u for u in urls}
            for f in as_completed(future_to_url):
                res = f.result()
                if res:
                    out.append(res)
            # BUG FIX: removed the original per-result sleep(0.5-1.0s) —
            # every request was already submitted by then, so it only
            # delayed result collection without rate-limiting anything.
        return out

    def extract_key_data_points(self, research_results: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Pull up to 10 numeric/statistical facts from search results.

        Scans Google snippets first (tagged 'statistic'), then scraped page
        content (tagged 'detailed_analysis'), keeping the first regex match
        per item.
        """
        data_points = []
        for itm in research_results.get('google_results', []):
            val = self._extract_numbers_and_stats(itm.get('snippet', ""))
            if val:
                data_points.append({
                    'value': val,
                    'source': itm.get('displayLink', ""),
                    'context': itm.get('snippet', ""),
                    'type': 'statistic'
                })
            for cnt in []:  # placeholder removed
                pass
        for cnt in research_results.get('scraped_content', []):
            val = self._extract_numbers_and_stats(cnt.get('content', ""))
            if val:
                data_points.append({
                    'value': val,
                    'source': cnt.get('url', ""),
                    'context': cnt.get('content', "")[:200],
                    'type': 'detailed_analysis'
                })
        return data_points[:10]

    def _extract_numbers_and_stats(self, text: str) -> Optional[str]:
        """Return the first money/percentage/number match in *text*, or None.

        Patterns are tried in priority order: currency amounts, percentages,
        then plain (possibly comma-grouped) numbers with magnitude words.
        """
        import re  # local import: the module does not import re at top level
        patterns = [
            r'\$[\d,]+(?:\.\d+)?(?:\s*(?:billion|million|trillion))?',
            r'\d+(?:\.\d+)?%',
            r'\d{1,3}(?:,\d{3})*(?:\.\d+)?(?:\s*(?:billion|million|thousand))?'
        ]
        for pat in patterns:
            m = re.search(pat, text, re.IGNORECASE)
            if m:
                return m.group()
        return None