from datetime import datetime

from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import (
    calculate_scores,
    analyze_sentiment,
    analyze_aspect_sentiment,
    calculate_vague_score,
    calculate_concrete_score,
)
from .llm_generator import generate_company_description, generate_ai_recommendations

# Aspect Keywords
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']


def _linear_sentiment_score(s_dict):
    """Map a sentiment dict ({'label': ..., 'score': ...}) to a 0-100 scale.

    Positive -> 50-100, Negative -> 0-50, anything else -> 50 (Neutral).
    Shared by both composite-score calculations in analyze_company (the
    original defined this helper twice with identical bodies).
    """
    if s_dict['label'] == 'Positive':
        return 50 + (s_dict['score'] * 50)   # 50-100
    if s_dict['label'] == 'Negative':
        return 50 - (s_dict['score'] * 50)   # 0-50
    return 50  # Neutral


def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news).

    Returns a list of contradiction dicts with evidence. NOTE(review): the three
    detection passes emit dicts with differing key sets ("source"/"risk_level"
    vs "evidence_url"/"severity") — preserved as-is since consumers may rely on
    either shape; confirm before normalizing.
    """
    contradictions = []

    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']

    # Keywords that indicate environmental context (Strict Physical Terms only)
    # Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']

    # Exclude regulators to avoid flagging financial fines as greenwashing
    # (RBI, SEBI, SEC, etc.)
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']

    for article in news_articles:
        # Check if article is relevant to environment before counting it as a contradiction
        text = (article['title'] + " " + article['content']).lower()

        # Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
        if any(ex in text for ex in financial_exclusions):
            continue
        if not any(k in text for k in env_context):
            continue

        for key in claim_keywords:
            if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence": article['title'],
                    "source": article['url'],
                    "risk_level": "High"
                })
                break  # one contradiction per article from this pass

    # Keywords that indicate skepticism or allegations
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']

    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)

    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })

    # New: General Compliance Risk Detection (Not just contradictions)
    # Search for specific legal/compliance keywords in all articles.
    # FIX: 'EPA' was uppercase and compared against lowercased text, so it
    # could never match; lowercased to 'epa' (substring match, like the rest).
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
            contradictions.append({
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })

    return contradictions


def detect_hidden_patterns(all_reviews):
    """
    Analyze reviews to find hidden patterns:
    - Sudden changes in sentiment
    - Repeated phrases (astroturfing)
    - Discrepancies between employee and customer reviews
    """
    patterns = []

    if len(all_reviews) > 10:
        # Check for repeated phrases (potential fake reviews):
        # compare the first 500 chars of each review for exact duplication.
        content_texts = [r['content'][:500] for r in all_reviews]
        unique_ratio = len(set(content_texts)) / len(content_texts)

        if unique_ratio < 0.7:
            patterns.append({
                "pattern": "Potential astroturfing detected",
                "description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
                "severity": "Medium"
            })

    # Check for platform discrepancies
    glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
    reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]

    if glassdoor_reviews and reddit_reviews:
        patterns.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
            "severity": "Info"
        })

    return patterns


async def analyze_company(company_name: str, pdf_path: str):
    """Run the full greenwashing analysis pipeline for one company.

    Steps: extract the PDF report, scrape news/reviews, score the PDF text,
    detect contradictions and hidden patterns, compute sentiment-driven and
    evidence-adjusted scores, classify risk, and assemble the result dict.
    Progress is reported throughout via report_progress.
    """
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)

    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)

    # --- PERPLEXITY AI INTEGRATION ---
    # Imported lazily so the module loads even when the client is unavailable.
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None
    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)

    # 2. Comprehensive Scraping (ALL available sources)
    # Always run scraping to get real news, even if Perplexity is active
    news_articles = await get_company_news(company_name)

    # Optional: We can still use Perplexity findings for internal scoring without displaying them as 'news'
    if pplx_data:
        pass  # Findings already in pplx_data for later use

    # Progress 50-80% handled by get_company_reviews
    # FIX: this was called twice back-to-back, scraping all reviews twice.
    reviews = await get_company_reviews(company_name)

    # 3. Analyze PDF Content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)

    # 4. Detect Contradictions and Hidden Patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)

    # 5. Analyze External Sentiment with ALL data
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text

    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}

    # Aspect-based sentiment (REAL SCORES)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)

    # 6. Calculate Evidence-Based Score with detailed metrics
    report_progress("Calculating final scores...", 95)

    # Calculate detailed scores (REAL METRICS)
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)

    # --- IMPROVED SCORING FORMULA ---
    # The composite sentiment drives the external portion of the score.
    # 1. Internal Sentiment (what the company says about itself)
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])

    s_int = _linear_sentiment_score(internal_sentiment_data)
    s_ext = _linear_sentiment_score(news_sentiment)
    s_rev = _linear_sentiment_score(reviews_sentiment)

    # 2. Composite Sentiment Score (0-100)
    # 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
    composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)

    # 3. Base Score Calculation
    # Blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative).
    final_score = composite_score_val

    # Adjust based on Concrete Data (The "Proof"):
    # high concrete data boosts the score, high vague language penalizes it.
    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25)  # Up to +25 points for concrete data
    score_modifier -= min(vague_ratio * 50, 20)      # Up to -20 points for vague language
    final_score += score_modifier

    # Contradiction Penalty (Facts Check)
    if contradictions:
        # Heavily penalize for contradictions
        final_score -= (len(contradictions) * 15)

    # Cap at 0-100
    final_score = max(0, min(100, final_score))

    # Calculate external sentiment gap
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])

    # Determine label
    # NOTE(review): `label` is currently not included in the result dict — confirm whether it should be.
    if final_score >= 80:
        label = "Excellent"
    elif final_score >= 60:
        label = "Good"
    elif final_score >= 40:
        label = "Average"
    elif final_score >= 20:
        label = "At Risk"
    else:
        label = "Greenwashing"

    # Determine risk level (3-State System)
    # 2 = Greenwashing (High/Critical)
    # 1 = At Risk (Medium)
    # 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []

    # 1. Contradictions (Immediate Greenwashing)
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")

    # 2. Score Thresholds
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 3. Vague Language
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 4. Empty Claims
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")

    # --- SAFE HARBOR OVERRIDE ---
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)

    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
            # High-risk industries need exceptional mitigation to pass
            if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                pass_safe_harbor = True
            else:
                if risk_level_code < 2:
                    risk_level_code = 2
                    risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
            pass_safe_harbor = True

    if pass_safe_harbor:
        risk_level_code = 0  # Force No Risk
        if risk_reasons:
            risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map code to string
    # IMPACT: User requested specific labels
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        # It's not "Greenwashing" per se, just risky. The binary flag is
        # kept 1 only for the High state for UI warnings.
        greenwashing_flag = 0
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Update reasons into result (prepended as tagged findings)
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']

    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}

    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to Gemini or defaults
        # (generate_company_description / generate_ai_recommendations are
        # already imported at module top; redundant local re-import removed.)
        try:
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)
            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }

    # --- COMPOSITE SENTIMENT SCORE ---
    # Recomputed (not reused from above) because env_sentences may have had
    # "[RISK] ..." entries prepended since the first analysis.
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])

    int_s = _linear_sentiment_score(internal_sentiment)
    ext_s = _linear_sentiment_score(news_sentiment)
    rev_s = _linear_sentiment_score(reviews_sentiment)

    # NOTE(review): weights here (0.4/0.4/0.2) differ from the score-driving
    # composite above (0.35/0.45/0.20) — confirm this divergence is intended.
    composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
    composite_score_norm = composite_score / 100.0

    # FIX: detect_hidden_patterns output was computed but discarded — the
    # result previously contained only the hardcoded vague-language entry.
    detected_patterns = list(hidden_patterns)
    if vague_ratio > 0.4:
        detected_patterns.append({"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"})

    result = {
        "company_name": company_name,
        "company_description": company_description,
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag,  # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3),
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },
        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}",
                f"Risk Level: {overall_risk_str}"
            ],
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },
        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },
        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low",
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # IMPACT: 3-State Output
            "overall_risk_level": overall_risk_str
        },
        "opportunities_and_strengths": [
            "Expand concrete data reporting",
            "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
            "Strong concrete data transparency",
            "Positive external sentiment alignment"
        ],
        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'],
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5]
        },
        "recommended_actions": ai_recommendations,
        "hidden_patterns": detected_patterns
    }

    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result