Spaces:
Sleeping
Sleeping
| from datetime import datetime | |
| from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text | |
| from .scraper import get_company_news, get_company_reviews, report_progress | |
| from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score | |
| from .llm_generator import generate_company_description, generate_ai_recommendations | |
# Aspect Keywords
# Substring lists fed to analyze_aspect_sentiment() (see analyze_company) to
# score external news/review coverage per environmental aspect.
# All entries are lowercase; presumably matched case-insensitively inside
# analyze_aspect_sentiment (.scoring) — TODO confirm against that module.
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news).

    Three passes over news_articles:
      1. Environment-relevant articles (after excluding financial-regulator
         stories) that pair a strong-claim keyword with a negative/legal term.
      2. If the PDF itself makes strong claims, any article containing
         skepticism keywords (greenwashing, lawsuits, allegations, ...).
      3. Any article containing regulatory/compliance keywords (flagged
         Critical, regardless of environmental context).

    Args:
        pdf_text: full text extracted from the sustainability report PDF.
        news_articles: list of dicts with 'title', 'content' and 'url' keys.

    Returns:
        List of contradiction dicts. NOTE: pass 1 emits keys
        evidence/source/risk_level while passes 2-3 emit
        evidence_url/evidence_title/severity — consumers must handle both
        shapes (kept as-is for backward compatibility). The same article can
        appear in more than one pass by design ("leveraging the same list").
    """
    contradictions = []
    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
    # Keywords that indicate environmental context (strict physical terms only).
    # Generic words like 'green', 'sustainability', 'environmental' were removed
    # because they also appear in financial contexts.
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
    # Exclude financial regulators/crimes (RBI, SEBI, SEC, ...) so financial
    # fines are not flagged as greenwashing.
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']

    # Pass 1: strong environmental claims being publicly questioned.
    for article in news_articles:
        text = (article['title'] + " " + article['content']).lower()
        # Safety check: ignore financial-regulator stories even if they say "green".
        if any(ex in text for ex in financial_exclusions):
            continue
        # Only count articles that are actually about the environment.
        if not any(k in text for k in env_context):
            continue
        for key in claim_keywords:
            if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence": article['title'],
                    "source": article['url'],
                    "risk_level": "High"
                })
                break  # one hit per article is enough

    # Pass 2: PDF makes strong claims AND external coverage is skeptical.
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })

    # Pass 3: general compliance-risk detection (not just contradictions).
    # BUG FIX: 'EPA' was uppercase but matched against lowercased content,
    # so it could never match; all keywords must be lowercase.
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
            contradictions.append({
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })
    return contradictions
def detect_hidden_patterns(all_reviews):
    """
    Scan scraped reviews for meta-signals that single reviews don't reveal:
    near-duplicate content (possible astroturfing / coordinated posting) and
    whether multiple platforms are represented (enables cross-validation).

    Args:
        all_reviews: list of dicts with at least 'content' and 'url' keys.

    Returns:
        List of {"pattern", "description", "severity"} dicts; may be empty.
    """
    findings = []

    # Duplicate-content check only makes sense with a reasonable sample size.
    if len(all_reviews) > 10:
        snippets = [review['content'][:500] for review in all_reviews]
        distinct_fraction = len(set(snippets)) / len(snippets)
        if distinct_fraction < 0.7:
            findings.append({
                "pattern": "Potential astroturfing detected",
                "description": f"Only {int(distinct_fraction*100)}% unique review content - may indicate coordinated posting",
                "severity": "Medium"
            })

    # Platform split: employee-focused (Glassdoor) vs community (Reddit) sources.
    from_glassdoor = [review for review in all_reviews if 'glassdoor' in review['url'].lower()]
    from_reddit = [review for review in all_reviews if 'reddit' in review['url'].lower()]
    if from_glassdoor and from_reddit:
        findings.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {len(from_glassdoor)} Glassdoor and {len(from_reddit)} Reddit discussions for cross-validation",
            "severity": "Info"
        })

    return findings
async def analyze_company(company_name: str, pdf_path: str):
    """
    Run the full greenwashing-analysis pipeline for one company.

    Steps: PDF extraction -> optional Perplexity research -> news/review
    scraping -> contradiction & hidden-pattern detection -> sentiment scoring
    -> composite score + 3-state risk classification -> AI-generated
    description/recommendations -> final result dict for the UI.

    Args:
        company_name: company to analyze (also used for industry heuristics).
        pdf_path: path to the sustainability report PDF.

    Returns:
        dict with scores, risk assessment, evidence links and recommendations.
    """
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)

    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)

    # --- PERPLEXITY AI INTEGRATION ---
    # Imported lazily so this module loads even if the client is unavailable.
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None
    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)

    # 2. Comprehensive scraping (always run, even when Perplexity is active,
    # so real news links are available for the evidence section; pplx_data
    # is kept for description/recommendations below, not shown as 'news').
    news_articles = await get_company_news(company_name)

    # Progress 50-80% is reported inside get_company_reviews.
    # BUG FIX: this call was previously awaited twice, doubling scrape time
    # for an identical result.
    reviews = await get_company_reviews(company_name)

    # 3. Analyze PDF content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)

    # 4. Detect contradictions and hidden patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)

    # 5. External sentiment over ALL scraped text
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text
    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}

    # Aspect-based sentiment (real scores)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)

    # 6. Evidence-based scoring with detailed metrics
    report_progress("Calculating final scores...", 95)
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)

    def _linear_score(s_dict):
        """Map a {'label','score'} sentiment dict onto 0-100 (Neutral = 50)."""
        if s_dict['label'] == 'Positive':
            return 50 + (s_dict['score'] * 50)  # 50-100
        if s_dict['label'] == 'Negative':
            return 50 - (s_dict['score'] * 50)  # 0-50
        return 50  # Neutral

    # Composite sentiment drives the qualitative portion of the score:
    # 35% internal (what they say) + 45% news + 20% reviews.
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
    s_int = _linear_score(internal_sentiment_data)
    s_ext = _linear_score(news_sentiment)
    s_rev = _linear_score(reviews_sentiment)
    final_score = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)

    # Blend with quantitative evidence ("the proof"):
    # concrete data boosts, vague language penalizes.
    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25)  # up to +25 for concrete data
    score_modifier -= min(vague_ratio * 50, 20)      # up to -20 for vague language
    final_score += score_modifier

    # Contradiction penalty (fact check): heavy, per contradiction.
    if contradictions:
        final_score -= len(contradictions) * 15
    # Clamp to 0-100.
    final_score = max(0, min(100, final_score))

    # Gap between news and review sentiment confidence (reported metric).
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])

    # Risk classification (3-state system):
    # 2 = Greenwashing (High/Critical), 1 = At Risk (Medium), 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []
    # 1. Contradictions are an immediate greenwashing signal.
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")
    # 2. Score thresholds.
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1)  # At Risk
    # 3. Vague language with little concrete backing.
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1)  # At Risk
    # 4. Empty claims: positive press with essentially no concrete data.
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")

    # --- SAFE HARBOR OVERRIDE ---
    # High-risk industries need exceptional mitigation to qualify.
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
            if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                pass_safe_harbor = True
            else:
                if risk_level_code < 2:
                    risk_level_code = 2
                    risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
            pass_safe_harbor = True
    if pass_safe_harbor:
        risk_level_code = 0  # force No Risk
        if risk_reasons:
            risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map code to UI strings. The binary greenwashing flag is 1 only for the
    # top tier; "At Risk" keeps flag 0 so binary UIs warn only on High.
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        greenwashing_flag = 0
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Surface risk reasons at the top of the findings list.
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']

    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}
    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to the LLM generator (imported at module top) or defaults.
        try:
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)
            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }

    # --- COMPOSITE SENTIMENT SCORE (displayed metric) ---
    # Recomputed here because env_sentences may now carry "[RISK]" prefixes;
    # NOTE: display weights are 40/40/20, intentionally different from the
    # 35/45/20 weights used for the final score above.
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])
    composite_score = (
        (_linear_score(internal_sentiment) * 0.4)
        + (_linear_score(news_sentiment) * 0.4)
        + (_linear_score(reviews_sentiment) * 0.2)
    )
    composite_score_norm = composite_score / 100.0

    result = {
        "company_name": company_name,
        "company_description": company_description,
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag,  # 1 if Greenwashing, else 0 (binary UIs)
        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3),
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },
        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}",
                f"Risk Level: {overall_risk_str}"
            ],
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },
        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },
        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low",
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # 3-state output requested by the UI
            "overall_risk_level": overall_risk_str
        },
        "opportunities_and_strengths": [
            "Expand concrete data reporting",
            "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
            "Strong concrete data transparency",
            "Positive external sentiment alignment"
        ],
        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'],
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5]
        },
        "recommended_actions": ai_recommendations,
        # BUG FIX: detect_hidden_patterns() output was previously computed but
        # discarded; merge it with the vague-language heuristic entry.
        "hidden_patterns": hidden_patterns + ([
            {"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
        ] if vague_ratio > 0.4 else [])
    }

    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result