Spaces:

Tanxshh
/

greenintellect

Sleeping

File size: 15,919 Bytes

02cc7f6

import asyncio
import os
import requests
import logging
from fake_useragent import UserAgent
try:
    from ddgs import DDGS
except ImportError:
    from duckduckgo_search import DDGS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

ua = UserAgent()

# Progress tracking
progress_callback = None

def set_progress_callback(callback):
    """Set a callback function to report progress"""
    global progress_callback
    progress_callback = callback

def report_progress(message, percentage):
    """Report progress if callback is set"""
    if progress_callback:
        progress_callback(message, percentage)
    print(f"[{percentage}%] {message}")

def setup_selenium_driver():
    """Setup a stealth Selenium driver with HuggingFace/Docker compatibility"""
    options = Options()
    options.add_argument("--headless=new")  # New headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-infobars")
    options.add_argument("--window-size=1920,1080")
    options.add_argument(f"user-agent={ua.random}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    
    # Check if running in Docker/HuggingFace environment
    is_docker = os.path.exists("/.dockerenv") or os.environ.get("HF_SPACE_ID")
    
    driver = None
    
    if is_docker:
        logger.info("Running in Docker/HuggingFace environment, using system Chromium")
        # Use system Chromium in Docker
        chromium_paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
        chromedriver_paths = ["/usr/bin/chromedriver", "/usr/local/bin/chromedriver"]
        
        for chromium_path in chromium_paths:
            if os.path.exists(chromium_path):
                options.binary_location = chromium_path
                logger.info(f"Using Chromium at: {chromium_path}")
                break
        
        try:
            # Try with system chromedriver first
            for chromedriver_path in chromedriver_paths:
                if os.path.exists(chromedriver_path):
                    service = Service(chromedriver_path)
                    driver = webdriver.Chrome(service=service, options=options)
                    logger.info(f"Using chromedriver at: {chromedriver_path}")
                    break
            
            if driver is None:
                # Fallback to webdriver_manager
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Docker Chrome setup failed: {e}")
            # Final fallback - try default Chrome
            try:
                driver = webdriver.Chrome(options=options)
            except Exception as e2:
                logger.error(f"All Chrome drivers failed: {e2}")
                raise
    else:
        # Local development - use webdriver_manager
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver with manager: {e}")
            driver = webdriver.Chrome(options=options)

    # Apply stealth settings
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
            
    return driver

async def scrape_url_selenium(url):
    """Scrape a URL using Selenium Stealth for better evasion"""
    logger.info(f"Scraping with Selenium: {url}")
    try:
        def _selenium_task():
            driver = setup_selenium_driver()
            try:
                driver.get(url)
                # Wait for some content (simple sleep for now, could be improved with WebDriverWait)
                import time
                time.sleep(3) 
                content = driver.page_source
                return content
            finally:
                driver.quit()
        
        content = await asyncio.to_thread(_selenium_task)
        
        # Parse with BS4 to get clean text
        soup = BeautifulSoup(content, 'html.parser')
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ', strip=True)
        return text, content
        
    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return "", ""

async def search_web(query, max_results=5):
    """
    Search the web using DuckDuckGo (no API key required)
    """
    try:
        results = []
        # specific implementation for DuckDuckGo might need sync wrapper if library is sync-only
        # DDGS().text() is synchronous generator
        
        def run_search():
            with DDGS() as ddgs:
                return list(ddgs.text(query, max_results=max_results))
                
        # Run sync search in thread
        search_results = await asyncio.to_thread(run_search)
        
        for res in search_results:
            results.append({
                "title": res.get('title', ''),
                "url": res.get('href', ''),
                "content": res.get('body', ''),
                "query_type": "web_search"
            })
            
        return results
    except Exception as e:
        print(f"Search error for '{query}': {e}")
        return []

async def get_news_from_api(company_name):
    """
    Use NewsAPI for reliable news collection
    """
    api_key = os.getenv('NEWS_API_KEY')
    if not api_key:
        return []
    
    try:
        url = f"https://newsapi.org/v2/everything"
        params = {
            'q': f'{company_name} AND (sustainability OR greenwashing OR ESG OR environmental)',
            'language': 'en',
            'sortBy': 'relevancy',
            'pageSize': 15,
            'apiKey': api_key
        }
        
        # Requests is blocking, so we run it in a thread to verify
        response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
        data = response.json()
        
        if data.get('status') == 'ok':
            articles = []
            for article in data.get('articles', []): 
                # Filter out removed content
                if article.get('title') == '[Removed]': continue

                # KEYWORD FILTERS (Same as Web Search)
                title_lower = (article.get('title') or "").lower()
                desc_lower = (article.get('description') or "").lower()
                text_to_check = title_lower + " " + desc_lower
                
                # 1. NEGATIVE FILTER: Exclude crime/fraud
                bad_keywords = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe", "punish", "litigation"]
                if any(bad in title_lower for bad in bad_keywords):
                    continue
                    
                # 2. POSITIVE FILTER: Must have ESG context (If query logic fails)
                # NewsAPI query already has keywords, but let's double check to be safe
                pass # Relying on API query "AND (sustainability OR ...)" for now

                
                articles.append({
                    'url': article.get('url', ''),
                    'title': article.get('title', ''),
                    'content': (article.get('description') or '') + ' ' + (article.get('content') or ''),
                    'query_type': 'news_api'
                })
            return articles
    except Exception as e:
        print(f"NewsAPI error: {e}")
    
    return []

# Helper for Filtering
def is_valid_result(res):
    """Filter out navigational, login, and irrelevant links"""
    url = res.get('url', '').lower()
    title = res.get('title', '').lower()
    content = res.get('content', '').lower()
    
    # 1. Exclude generic Google/Navigational links
    invalid_domains = ['google.com/search', 'google.com/url', 'accounts.google.com', 'support.google.com', 
                       'youtube.com', 'facebook.com', 'twitter.com/login', 'linkedin.com/login']
    
    # 2. Exclude actions
    invalid_terms = ['sign in', 'log in', 'forgot password', 'download', 'captcha', 'security check', 'robot', 'access denied']
    
    if any(d in url for d in invalid_domains): return False
    if any(t in title for t in invalid_terms): return False
    
    # 3. Minimum content length/quality (for reviews)
    # if len(content) < 20: return False # Optional rule
    
    return True

async def get_company_news(company_name):
    """Get news using NewsAPI and DuckDuckGo Fallback"""
    report_progress(f"Starting news collection for {company_name}", 10)
    
    articles = []
    # 1. Try NewsAPI (Limit increased to 20)
    report_progress("Checking NewsAPI...", 15)
    api_articles = await get_news_from_api(company_name)
    articles.extend(api_articles)
    
    # 2. Add Web Search (DuckDuckGo) for deeper coverage
    report_progress("Fetching additional news via Web Search...", 25)
    
    queries = [
        f'"{company_name}" environmental impact report news',
        f'"{company_name}" greenwashing controversy scandal',
        f'"{company_name}" sustainability goals criticism',
        f'"{company_name}" ESG rating news detected',
        f'"{company_name}" climate change commitments review' 
    ]
    
    # ESG/Climate Keywords (Refined to avoid generic matches)
    ESG_KEYWORDS = [
        "climate", "carbon", "emission", "pollution", "sustainability", "esg",  
        "renewable", "net zero", "biodiversity", "ecological", "greenhouse", "fossil fuel"
    ]
    # "green" and "environment" removed as they match "green light", "business environment"

    # Negative Keywords to exclude financial crime/generic news
    NEGATIVE_KEYWORDS = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe"]

    for query in queries:
        if len(articles) >= 20: break 
        
        results = await search_web(query, max_results=5) 
        for res in results:
            if not is_valid_result(res): continue 
            
            # Combine Title + Body for checking
            text_to_check = (res.get('title', '') + " " + res.get('body', '')).lower()
            title_lower = res.get('title', '').lower()

            # 1. NEGATIVE FILTER: Exclude crime/fraud immediately
            if any(bad in title_lower for bad in NEGATIVE_KEYWORDS):
                continue

            # 2. POSITIVE FILTER: Must have ESG context
            # Re-adding "environmental" specifically (not just environment)
            if "environmental" in text_to_check: pass 
            elif not any(k in text_to_check for k in ESG_KEYWORDS):
                continue # Skip if no environmental context found
            
            # Simple de-duplication
            if not any(a['url'] == res['url'] for a in articles):
                articles.append(res)
    
    report_progress(f"News collection complete: {len(articles)} articles", 45)
    return articles[:20] 

async def get_company_reviews(company_name):
    """Get reviews using Web Search (Glassdoor, Reddit, etc.)"""
    report_progress(f"Starting review collection for {company_name}", 50)
    
    reviews = []
    
    # Using site: operators to force specific sources
    queries = [
        f'site:glassdoor.com "{company_name}" reviews "environment" OR "sustainability"',
        f'site:reddit.com "{company_name}" greenwashing OR "toxic"',
        f'site:trustpilot.com "{company_name}" environment',
        f'"{company_name}" employee reviews sustainability ethics',
        f'"{company_name}" environmental controversy reviews', # Broad fallback
        f'"{company_name}" corporate responsibility feedback'   # Broad fallback
    ]
    
    total_queries = len(queries)
    for idx, query in enumerate(queries):
        progress = 50 + (idx / total_queries) * 30
        report_progress(f"Searching specific reviews: {query}", int(progress))
        
        results = await search_web(query, max_results=8)
        
        for res in results:
            if len(reviews) >= 40: break
            if not is_valid_result(res): continue # FILTER HERE

            # RELEVANCE CHECK (Strict)
            # Ensure company name is actually mentioned in title or snippet
            c_name_lower = company_name.lower()
            res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
            
            # Simple substring match (can be improved with fuzzy later if needed)
            if c_name_lower not in res_content and c_name_lower.split()[0] not in res_content:  
                # Try strict full name, then at least first word (e.g. "Google" in "Google Inc")
                # But careful with generic first words like "The" or "Green"
                if len(c_name_lower.split()[0]) > 3:
                     if c_name_lower.split()[0] not in res_content:
                         print(f"Skipping unrelated result: {res['title']}")
                         continue
                else:
                     continue # Too short, require full name match
            
            # Determine source type based on URL
            source = "web"
            if "glassdoor" in res['url']: source = "Glassdoor"
            elif "twitter" in res['url'] or "x.com" in res['url']: source = "Twitter"
            elif "linkedin" in res['url']: source = "LinkedIn"
            elif "reddit" in res['url']: source = "Reddit"
            elif "trustpilot" in res['url']: source = "Trustpilot"
            
            # Clean title
            title = res['title'].replace(" | Glassdoor", "").replace(" | Reddit", "")
            
            reviews.append({
                "url": res['url'],
                "title": title,
                "content": res['content'], # Use the snippet as the review content
                "source_type": source
            })
        
        await asyncio.sleep(1)

    # If few reviews found, try a broader fallback
    if len(reviews) < 3:
         report_progress("Few reviews found, trying specific broader query...", 75)
         fallback_results = await search_web(f'"{company_name}" reviews environment', max_results=5)
         for res in fallback_results:
             if is_valid_result(res) and not any(r['url'] == res['url'] for r in reviews):
                 # RELEVANCE CHECK
                 c_name_lower = company_name.lower()
                 res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
                 if c_name_lower not in res_content and c_name_lower.split()[0] not in res_content:
                     if len(c_name_lower.split()[0]) > 3:
                        if c_name_lower.split()[0] not in res_content: continue
                     else: continue

                 reviews.append({
                    "url": res['url'],
                    "title": res['title'],
                    "content": res['content'],
                    "source_type": "Web Search"
                })

    report_progress(f"Review collection complete: {len(reviews)} reviews", 80)
    return reviews

    # NO MOCK DATA FALLBACK
    return reviews