File size: 18,951 Bytes
02cc7f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
from datetime import datetime
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
from .llm_generator import generate_company_description, generate_ai_recommendations

# Aspect Keywords: substring lists used by analyze_aspect_sentiment to slice
# external text (news + reviews) into per-topic sentiment scores.
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']

def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news).

    Three passes over ``news_articles`` (each article is a dict with at least
    ``title``, ``content`` and ``url``):
      1. Environment-related articles whose text pairs a strong claim keyword
         with a negative marker (lawsuit, fine, investigation, ...).
      2. Skeptic keywords in any article, but only when the PDF itself makes
         strong claims.
      3. General regulatory/compliance red flags, regardless of PDF content.

    All matching is case-insensitive substring matching against lowercased
    text. Returns a list of contradiction dicts; the key sets differ per pass
    and are kept as-is for backward compatibility with the report consumers.
    Note: an article may legitimately appear in more than one pass.
    """
    contradictions = []

    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']

    # Keywords that indicate environmental context (Strict Physical Terms only)
    # Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']

    # Exclude regulators to avoid flagging financial fines as greenwashing
    # (RBI, SEBI, SEC, etc.)
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']

    # Negative markers that turn a claim mention into a questioned claim.
    negative_markers = ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']

    # Pass 1: environment-scoped claim-vs-negative matches.
    for article in news_articles:
        # Check if article is relevant to environment before counting it as a contradiction
        text = (article['title'] + " " + article['content']).lower()

        # Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
        if any(ex in text for ex in financial_exclusions):
            continue

        if not any(k in text for k in env_context):
            continue

        for key in claim_keywords:
            if key in text and any(neg in text for neg in negative_markers):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence": article['title'],
                    "source": article['url'],
                    "risk_level": "High"
                })
                break  # one flag per article in this pass

    # Pass 2: skepticism/allegation keywords, gated on the PDF making strong claims.
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']

    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)

    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })

    # Pass 3: General Compliance Risk Detection (not just contradictions).
    # BUG FIX: 'EPA' was uppercase and compared against lowercased text, so it
    # could never match. Lowercased to 'epa'. (Substring matching may still
    # over-match words like 'departure'; acceptable, consistent with the rest.)
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
            contradictions.append({  # Leveraging the same list for now, or could create a separate list
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })

    return contradictions

def detect_hidden_patterns(all_reviews):
    """
    Scan a list of review dicts (``content``, ``url``) for hidden signals:

    - Near-duplicate review bodies (possible astroturfing) once there are
      more than 10 reviews.
    - Presence of both Glassdoor and Reddit sources, which enables
      cross-platform validation.

    Returns a list of pattern dicts with ``pattern``/``description``/``severity``.
    """
    findings = []

    # Astroturfing heuristic: compare the first 500 chars of each review and
    # flag when fewer than 70% of the snippets are unique.
    if len(all_reviews) > 10:
        snippets = [review['content'][:500] for review in all_reviews]
        uniqueness = len(set(snippets)) / len(snippets)
        if uniqueness < 0.7:
            findings.append({
                "pattern": "Potential astroturfing detected",
                "description": f"Only {int(uniqueness*100)}% unique review content - may indicate coordinated posting",
                "severity": "Medium"
            })

    # Platform coverage: count sources per platform by URL substring.
    glassdoor_count = sum(1 for review in all_reviews if 'glassdoor' in review['url'].lower())
    reddit_count = sum(1 for review in all_reviews if 'reddit' in review['url'].lower())

    if glassdoor_count and reddit_count:
        findings.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {glassdoor_count} Glassdoor and {reddit_count} Reddit discussions for cross-validation",
            "severity": "Info"
        })

    return findings

async def analyze_company(company_name: str, pdf_path: str):
    """
    Run the full greenwashing-analysis pipeline for one company.

    Steps: extract PDF text -> optional Perplexity deep research -> scrape
    news and reviews -> score the PDF -> detect contradictions and hidden
    patterns -> sentiment analysis (overall + per aspect) -> composite
    scoring with penalties/boosts -> risk classification with a safe-harbor
    override -> AI-generated description/recommendations -> assemble the
    final report dict.

    Parameters:
        company_name: Display name used for scraping, research and labels.
        pdf_path: Path to the sustainability-report PDF to analyze.

    Returns:
        A report dict (scores, risk assessment, evidence links, AI
        recommendations) consumed by the API/UI layer.
    """
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)

    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)

    # --- PERPLEXITY AI INTEGRATION ---
    # Imported lazily so the module still loads when the optional client
    # (or its API key) is unavailable.
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None

    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)

    # 2. Comprehensive Scraping (ALL available sources)
    # Always run scraping to get real news, even if Perplexity is active
    news_articles = await get_company_news(company_name)

    # Optional: We can still use Perplexity findings for internal scoring without displaying them as 'news'
    if pplx_data:
        pass  # Findings already in pplx_data for later use

    # Progress 50-80% handled by get_company_reviews
    # BUG FIX: this call was previously duplicated (awaited twice back to
    # back), doubling scraping time and network load for identical data.
    reviews = await get_company_reviews(company_name)

    # 3. Analyze PDF Content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)

    # 4. Detect Contradictions and Hidden Patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)

    # 5. Analyze External Sentiment with ALL data
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text

    # Neutral 0.5 fallback keeps downstream math well-defined on empty input.
    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}

    # Aspect-based sentiment (REAL SCORES)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)

    # 6. Calculate Evidence-Based Score with detailed metrics
    report_progress("Calculating final scores...", 95)

    # Calculate detailed scores (REAL METRICS)
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)

    # --- IMPROVED SCORING FORMULA ---
    # The composite sentiment is calculated FIRST so it can drive the
    # external portion of the final score.

    # 1. Internal Sentiment (what the company says about itself)
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])

    def get_linear_score_local(s_dict):
        # Convert label+confidence to 0-100 scale
        if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50)  # 50-100
        if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50)  # 0-50
        return 50  # Neutral

    s_int = get_linear_score_local(internal_sentiment_data)
    s_ext = get_linear_score_local(news_sentiment)
    s_rev = get_linear_score_local(reviews_sentiment)

    # 2. Composite Sentiment Score (0-100)
    # 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
    composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)

    # 3. Base Score Calculation: blend the Composite Sentiment (qualitative)
    # with Concrete Data (quantitative).
    final_score = composite_score_val

    # Adjust based on Concrete Data (The "Proof"):
    # high concrete data boosts the score, vague language penalizes it.
    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25)  # Up to +25 points for concrete data
    score_modifier -= min(vague_ratio * 50, 20)      # Up to -20 points for vague language
    final_score += score_modifier

    # Contradiction Penalty (Facts Check): -15 per contradiction found.
    if contradictions:
        final_score -= (len(contradictions) * 15)

    # Clamp to 0-100
    final_score = max(0, min(100, final_score))

    # Gap between news and review sentiment confidence (reported only).
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])

    # Determine label from the final score bands.
    if final_score >= 80: label = "Excellent"
    elif final_score >= 60: label = "Good"
    elif final_score >= 40: label = "Average"
    elif final_score >= 20: label = "At Risk"
    else: label = "Greenwashing"

    # Determine risk level (3-State System)
    # 2 = Greenwashing (High/Critical)
    # 1 = At Risk (Medium)
    # 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []

    # 1. Contradictions (Immediate Greenwashing)
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")

    # 2. Score Thresholds
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 3. Vague Language
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 4. Empty Claims: positive press with essentially no concrete data.
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")

    # --- SAFE HARBOR OVERRIDE ---
    # Companies with enough concrete data and few contradictions can have the
    # risk flag cleared; high-risk industries face a much higher bar.
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)

    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
            if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                pass_safe_harbor = True
            else:
                if risk_level_code < 2:
                    risk_level_code = 2
                    risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
            pass_safe_harbor = True

    if pass_safe_harbor:
        risk_level_code = 0  # Force No Risk
        if risk_reasons:
            risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map code to string
    # IMPACT: User requested specific labels
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        greenwashing_flag = 0  # Binary UI flag stays 1 only for the High state.
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Surface risk reasons at the top of the findings list.
    # NOTE: this mutates pdf_scores['env_sentences'], which the second
    # sentiment pass below deliberately re-reads.
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']

    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}

    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to Gemini or defaults.
        # (generate_company_description / generate_ai_recommendations are
        # already imported at module top; the redundant local re-import was removed.)
        try:
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)

            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }

    # --- COMPOSITE SENTIMENT SCORE (reported value) ---
    # Intentionally recomputed rather than reusing composite_score_val above:
    # env_sentences may now include the prepended [RISK] lines, and this
    # variant uses different weights (40/40/20) for the displayed metric.
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])

    def get_linear_score(s_dict):
        if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50)
        if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50)
        return 50  # Neutral

    int_s = get_linear_score(internal_sentiment)
    ext_s = get_linear_score(news_sentiment)
    rev_s = get_linear_score(reviews_sentiment)

    composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
    composite_score_norm = composite_score / 100.0

    # Assemble the final report consumed by the API/UI layer.
    result = {
        "company_name": company_name,
        "company_description": company_description,
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag,  # 1 if Greenwashing, else 0 (Simplification for some binary UIs)

        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3),
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },

        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}",
                f"Risk Level: {overall_risk_str}"
            ],
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },

        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },

        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low",
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # IMPACT: 3-State Output
            "overall_risk_level": overall_risk_str
        },

        "opportunities_and_strengths": [
             "Expand concrete data reporting",
             "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
             "Strong concrete data transparency",
             "Positive external sentiment alignment"
        ],

        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'],
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5]
        },

        "recommended_actions": ai_recommendations,

        "hidden_patterns": [
            {"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
        ] if vague_ratio > 0.4 else []
    }

    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result