Spaces:

Tanxshh
/

greenintellect

Sleeping

File size: 18,951 Bytes

02cc7f6

from datetime import datetime
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
from .llm_generator import generate_company_description, generate_ai_recommendations

# Aspect Keywords
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']

def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news)
    Returns list of contradictions with evidence
    """
    contradictions = []

    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
    
    # Keywords that indicate environmental context (Strict Physical Terms only)
    # Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
    
    # Exclude regulators to avoid flagging financial fines as greenwashing
    # (RBI, SEBI, SEC, etc.)
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']

    for article in news_articles:
        # Check if article is relevant to environment before counting it as a contradiction
        text = (article['title'] + " " + article['content']).lower()
        
        # Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
        if any(ex in text for ex in financial_exclusions):
            continue

        if not any(k in text for k in env_context):
            continue
            
        for key in claim_keywords:
            if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
               contradictions.append({
                   "claim_type": "Environmental claim questioned",
                   "evidence": article['title'],
                   "source": article['url'],
                   "risk_level": "High"
               })
               break
    
    # Keywords that indicate skepticism or allegations
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
    
    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
    
    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })
                
    # New: General Compliance Risk Detection (Not just contradictions)
    # Search for specific legal/compliance keywords in all articles
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'EPA', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
             contradictions.append({ # Leveraging the same list for now, or could create a separate list
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
             })
    
    return contradictions

def detect_hidden_patterns(all_reviews):
    """
    Analyze reviews to find hidden patterns:
    - Sudden changes in sentiment
    - Repeated phrases (astroturfing)
    - Discrepancies between employee and customer reviews
    """
    patterns = []
    
    if len(all_reviews) > 10:
        # Check for repeated phrases (potential fake reviews)
        content_texts = [r['content'][:500] for r in all_reviews]
        unique_ratio = len(set(content_texts)) / len(content_texts)
        
        if unique_ratio < 0.7:
            patterns.append({
                "pattern": "Potential astroturfing detected",
                "description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
                "severity": "Medium"
            })
    
    # Check for platform discrepancies
    glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
    reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]
    
    if glassdoor_reviews and reddit_reviews:
        patterns.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
            "severity": "Info"
        })
    
    return patterns

async def analyze_company(company_name: str, pdf_path: str):
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)
    
    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)
    
    # --- PERPLEXITY AI INTEGRATION ---
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None
    
    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)
        
    # 2. Comprehensive Scraping (ALL available sources)
    # Always run scraping to get real news, even if Perplexity is active
    news_articles = await get_company_news(company_name)
    
    # Optional: We can still use Perplexity findings for internal scoring without displaying them as 'news'
    if pplx_data:
         pass # Findings already in pplx_data for later use
    
    # Progress 50-80% handled by get_company_reviews  
    reviews = await get_company_reviews(company_name)
    
    # Progress 50-80% handled by get_company_reviews  
    reviews = await get_company_reviews(company_name)
    
    # 3. Analyze PDF Content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)
    
    # 4. Detect Contradictions and Hidden Patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)
    
    # 5. Analyze External Sentiment with ALL data
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text
    
    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}
    
    # Aspect-based sentiment (REAL SCORES)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)
    
    # 6. Calculate Evidence-Based Score with detailed metrics
    report_progress("Calculating final scores...", 95)
    
    # Calculate detailed scores (REAL METRICS)
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)
    
    # --- IMPROVED SCORING FORMULA ---
    # We now calculate the composite sentiment FIRST and let it drive the external portion of the score.
    # See lines 340+ for where we normally calculated it. We'll do it here to affect the score.
    
    # 1. Internal Sentiment
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
    
    def get_linear_score_local(s_dict):
        # Convert label+confidence to 0-100 scale
        if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50) # 50-100
        if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50) # 0-50
        return 50 # Neutral

    s_int = get_linear_score_local(internal_sentiment_data)
    s_ext = get_linear_score_local(news_sentiment)
    s_rev = get_linear_score_local(reviews_sentiment)
    
    # 2. Composite Sentiment Score (0-100)
    # 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
    composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)
    
    # 3. Base Score Calculation
    # We blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative)
    
    # Start with the Sentiment Score (0-100)
    final_score = composite_score_val
    
    # Adjust based on Concrete Data (The "Proof")
    # If they have high concrete data, boost the score.
    # If they have high vague language, penalize the score.
    
    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25) # Up to +25 points for concrete data
    score_modifier -= min(vague_ratio * 50, 20)     # Up to -20 points for vague language
    
    # Apply modifier
    final_score += score_modifier
    
    # Contradiction Penalty (Facts Check)
    if contradictions:
        # Heavily penalize for contradictions
        final_score -= (len(contradictions) * 15)
        
    # Cap at 0-100
    final_score = max(0, min(100, final_score))

    
    # Calculate external sentiment gap
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])
        # Determine label
    if final_score >= 80: label = "Excellent"
    elif final_score >= 60: label = "Good"
    elif final_score >= 40: label = "Average"
    elif final_score >= 20: label = "At Risk"
    else: label = "Greenwashing"
    
    # Determine risk level (3-State System)
    # 2 = Greenwashing (High/Critical)
    # 1 = At Risk (Medium)
    # 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []

    # 1. Contradictions (Immediate Greenwashing)
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")

    # 2. Score Thresholds
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1) # At Risk
        
    # 3. Vague Language
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1) # At Risk

    # 4. Empty Claims
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")
    
    # --- SAFE HARBOR OVERRIDE ---
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
    
    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
             if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                 pass_safe_harbor = True
             else:
                 if risk_level_code < 2:
                     risk_level_code = 2
                     risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
             pass_safe_harbor = True
             
    if pass_safe_harbor:
        risk_level_code = 0 # Force No Risk
        if risk_reasons:
             risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map code to string
    # IMPACT: User requested specific labels
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        greenwashing_flag = 0 # It's not "Greenwashing" per se, just risky? Or should flag be 1? 
        # Typically "Greenwashing" flag is binary for UI warnings, but let's keep it 1 only for High.
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Update reasons into result
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']
    
    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}
    
    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to Gemini or defaults
        try:
            from .llm_generator import generate_company_description, generate_ai_recommendations
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)
            
            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }
    
    # --- COMPOSITE SENTIMENT SCORE ---
    # (calculation remains same)
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])
    
    def get_linear_score(s_dict):
        if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50)
        if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50)
        return 50 # Neutral
        
    int_s = get_linear_score(internal_sentiment)
    ext_s = get_linear_score(news_sentiment)
    rev_s = get_linear_score(reviews_sentiment)
    
    composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
    composite_score_norm = composite_score / 100.0

    # (AI generation already done above - using company_description and ai_recommendations)

    # Update result
    result = {
        "company_name": company_name,
        "company_description": company_description, 
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag, # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
        
        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3), 
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },
        
        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}", 
                f"Risk Level: {overall_risk_str}"
            ],
            # ...
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },
        
        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },
        
        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low", 
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # IMPACT: 3-State Output
            "overall_risk_level": overall_risk_str
        },
        
        # ... (rest same) ...
        "opportunities_and_strengths": [
             "Expand concrete data reporting",
             "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
             "Strong concrete data transparency",
             "Positive external sentiment alignment"
        ],
        
        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'], 
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5] 
        },
        
        "recommended_actions": ai_recommendations,
        
        "hidden_patterns": [
            {"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
        ] if vague_ratio > 0.4 else []
    }
    
    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result