from datetime import datetime
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
from .llm_generator import generate_company_description, generate_ai_recommendations
# Aspect Keywords
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
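# Each list drives analyze_aspect_sentiment below: the combined news + review
# text is scored separately per aspect (emissions, energy, waste).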
def detect_contradictions(pdf_text, news_articles):
"""
    Detect contradictions between company claims (PDF) and external reports (news).
    Returns a list of contradiction dicts with evidence links.
"""
contradictions = []
# Keywords that indicate strong claims
claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
# Keywords that indicate environmental context (Strict Physical Terms only)
# Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
# Exclude regulators to avoid flagging financial fines as greenwashing
# (RBI, SEBI, SEC, etc.)
financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
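    # e.g. a headline like "SEBI fines firm over green bond disclosures"
    # (hypothetical) mentions a regulator, so it is skipped even though it
    # contains environmental wording.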
for article in news_articles:
# Check if article is relevant to environment before counting it as a contradiction
text = (article['title'] + " " + article['content']).lower()
# Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
if any(ex in text for ex in financial_exclusions):
continue
if not any(k in text for k in env_context):
continue
for key in claim_keywords:
if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence_title": article['title'],
                    "evidence_url": article['url'],
                    "severity": "High"
                })
break
# Keywords that indicate skepticism or allegations
skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
pdf_lower = pdf_text.lower()
has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
if has_strong_claims:
for article in news_articles:
content_lower = article['content'].lower()
if any(keyword in content_lower for keyword in skeptic_keywords):
contradictions.append({
"claim_type": "Environmental commitment",
"evidence_url": article['url'],
"evidence_title": article['title'],
"severity": "High"
})
# New: General Compliance Risk Detection (Not just contradictions)
# Search for specific legal/compliance keywords in all articles
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']  # lowercase: matched against lowercased text
    for article in news_articles:
        content_lower = article['content'].lower()
        # Stay consistent with the financial-regulator exclusion above, so a
        # purely financial fine is not logged as a compliance contradiction.
        if any(ex in content_lower for ex in financial_exclusions):
            continue
        if any(keyword in content_lower for keyword in compliance_keywords):
            # Reuses the contradictions list for now; a dedicated compliance
            # list could be split out later.
            contradictions.append({
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })
return contradictions
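# Illustrative call with hypothetical inputs (shapes only, not real data):
#   detect_contradictions(
#       "We achieved net zero across all operations...",
#       [{'title': 'Investigation into carbon neutral claims',
#         'content': 'Regulators call the net zero claim misleading...',
#         'url': 'https://example.com/article'}])
# would flag entries such as
#   {'claim_type': 'Environmental claim questioned', 'severity': 'High', ...}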
def detect_hidden_patterns(all_reviews):
"""
Analyze reviews to find hidden patterns:
- Sudden changes in sentiment
- Repeated phrases (astroturfing)
- Discrepancies between employee and customer reviews
"""
patterns = []
if len(all_reviews) > 10:
# Check for repeated phrases (potential fake reviews)
content_texts = [r['content'][:500] for r in all_reviews]
unique_ratio = len(set(content_texts)) / len(content_texts)
if unique_ratio < 0.7:
patterns.append({
"pattern": "Potential astroturfing detected",
"description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
"severity": "Medium"
})
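        # e.g. 20 reviews with only 12 distinct bodies -> unique_ratio 0.6,
        # below the 0.7 threshold, so the astroturfing flag fires.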
# Check for platform discrepancies
glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]
if glassdoor_reviews and reddit_reviews:
patterns.append({
"pattern": "Multi-platform analysis available",
"description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
"severity": "Info"
})
return patterns
async def analyze_company(company_name: str, pdf_path: str):
report_progress(f"Starting comprehensive analysis for {company_name}", 5)
# 1. Process PDF
report_progress("Processing PDF document...", 8)
pdf_text = extract_text_from_pdf(pdf_path)
pdf_sentences = split_sentences(pdf_text)
# --- PERPLEXITY AI INTEGRATION ---
from .perplexity_client import research_company, PERPLEXITY_API_KEY
pplx_data = None
if PERPLEXITY_API_KEY:
report_progress("Conducting deep research...", 15)
pplx_data = research_company(company_name)
# 2. Comprehensive Scraping (ALL available sources)
# Always run scraping to get real news, even if Perplexity is active
news_articles = await get_company_news(company_name)
    # Perplexity findings stay in pplx_data for the description/recommendation
    # steps below; they are deliberately not surfaced as 'news' articles.
# Progress 50-80% handled by get_company_reviews
reviews = await get_company_reviews(company_name)
# 3. Analyze PDF Content
report_progress("Analyzing PDF content...", 82)
pdf_scores = calculate_scores(pdf_sentences)
# 4. Detect Contradictions and Hidden Patterns
report_progress("Detecting contradictions and patterns...", 85)
contradictions = detect_contradictions(pdf_text, news_articles)
hidden_patterns = detect_hidden_patterns(reviews)
# 5. Analyze External Sentiment with ALL data
report_progress("Analyzing sentiment...", 90)
news_text = [a['content'] for a in news_articles]
reviews_text = [r['content'] for r in reviews]
all_external_text = news_text + reviews_text
news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}
# Aspect-based sentiment (REAL SCORES)
emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)
# 6. Calculate Evidence-Based Score with detailed metrics
report_progress("Calculating final scores...", 95)
# Calculate detailed scores (REAL METRICS)
green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
vague_ratio = calculate_vague_score(pdf_sentences)
concrete_ratio = calculate_concrete_score(pdf_sentences)
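    # e.g. 8 vague sentences out of 40 -> vague_ratio of 0.2 (the helpers are
    # assumed to return per-sentence fractions in [0, 1], which is what the
    # 0-1 thresholds used below imply).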
    # --- IMPROVED SCORING FORMULA ---
    # Compute the composite sentiment FIRST so it can drive the external
    # portion of the final score, rather than being derived after the fact.
# 1. Internal Sentiment
internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
def get_linear_score_local(s_dict):
# Convert label+confidence to 0-100 scale
if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50) # 50-100
if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50) # 0-50
return 50 # Neutral
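    # e.g. {'label': 'Positive', 'score': 0.8} -> 90,
    #      {'label': 'Negative', 'score': 0.8} -> 10, Neutral -> 50.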
s_int = get_linear_score_local(internal_sentiment_data)
s_ext = get_linear_score_local(news_sentiment)
s_rev = get_linear_score_local(reviews_sentiment)
# 2. Composite Sentiment Score (0-100)
# 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)
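    # e.g. s_int=70, s_ext=60, s_rev=50 -> 0.35*70 + 0.45*60 + 0.20*50 = 61.5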
# 3. Base Score Calculation
# We blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative)
# Start with the Sentiment Score (0-100)
final_score = composite_score_val
# Adjust based on Concrete Data (The "Proof")
# If they have high concrete data, boost the score.
# If they have high vague language, penalize the score.
score_modifier = 0
score_modifier += min(concrete_ratio * 100, 25) # Up to +25 points for concrete data
score_modifier -= min(vague_ratio * 50, 20) # Up to -20 points for vague language
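    # e.g. concrete_ratio 0.10 -> +10 points; vague_ratio 0.30 -> -15 points
    # (net modifier of -5).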
# Apply modifier
final_score += score_modifier
# Contradiction Penalty (Facts Check)
if contradictions:
# Heavily penalize for contradictions
final_score -= (len(contradictions) * 15)
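        # e.g. 2 contradictions -> -30 points before clamping.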
# Cap at 0-100
final_score = max(0, min(100, final_score))
    # External sentiment gap: compare the label-aware linear scores so that,
    # e.g., Positive 0.9 vs Negative 0.9 registers as a large gap instead of 0.
    ext_gap = abs(s_ext - s_rev) / 100.0
# Determine label
if final_score >= 80: label = "Excellent"
elif final_score >= 60: label = "Good"
elif final_score >= 40: label = "Average"
elif final_score >= 20: label = "At Risk"
else: label = "Greenwashing"
# Determine risk level (3-State System)
# 2 = Greenwashing (High/Critical)
# 1 = At Risk (Medium)
# 0 = No Risk (Low)
risk_level_code = 0
risk_reasons = []
# 1. Contradictions (Immediate Greenwashing)
if contradictions:
risk_level_code = 2
risk_reasons.append("External contradictions found")
# 2. Score Thresholds
if final_score < 40:
risk_level_code = max(risk_level_code, 2)
risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
elif final_score < 60:
risk_level_code = max(risk_level_code, 1) # At Risk
# 3. Vague Language
if vague_ratio > 0.50 and concrete_ratio < 0.10:
risk_level_code = 2
risk_reasons.append("Excessive vague language")
elif vague_ratio > 0.40 and concrete_ratio < 0.20:
risk_level_code = max(risk_level_code, 1) # At Risk
# 4. Empty Claims
if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
risk_level_code = 2
risk_reasons.append("Positive press without concrete data")
# --- SAFE HARBOR OVERRIDE ---
high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
pass_safe_harbor = False
if concrete_ratio > 0.05 and len(contradictions) < 2:
if is_high_risk:
if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
pass_safe_harbor = True
else:
if risk_level_code < 2:
risk_level_code = 2
risk_reasons.append("High Risk Industry without exceptional mitigation")
elif emission_sentiment['label'] != 'Negative':
pass_safe_harbor = True
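    # Net effect: safe harbor needs >5% concrete claims and <2 contradictions;
    # high-risk industries additionally need >20% concrete claims plus positive
    # emission sentiment (otherwise they are escalated), while other companies
    # only need emission sentiment that is not Negative.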
if pass_safe_harbor:
risk_level_code = 0 # Force No Risk
if risk_reasons:
risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
print(f"SAFE HARBOR TRIGGERED for {company_name}")
# Map code to string
# IMPACT: User requested specific labels
if risk_level_code == 2:
overall_risk_str = "Greenwashing"
greenwashing_flag = 1
elif risk_level_code == 1:
overall_risk_str = "At Risk"
        greenwashing_flag = 0  # Binary flag stays 1 only for confirmed Greenwashing; "At Risk" is a softer, non-binary warning for the UI.
else:
overall_risk_str = "No Risk"
greenwashing_flag = 0
# Update reasons into result
if risk_reasons and risk_level_code >= 1:
pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']
# --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
company_description = ""
ai_recommendations = {}
if pplx_data:
report_progress("Using insights...", 95)
company_description = pplx_data.get("description", "Description unavailable.")
ai_recommendations = pplx_data.get("recommendations", {})
else:
# Fallback to Gemini or defaults
try:
report_progress("Generating insights...", 98)
company_description = generate_company_description(company_name)
pre_result = {
"greenwashingLabel": greenwashing_flag,
"internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
"contradictions_detected": contradictions,
"external_summary": {"public_sentiment": news_sentiment['label']}
}
ai_recommendations = generate_ai_recommendations(company_name, pre_result)
except Exception as e:
print(f"AI Generation fallback failed: {e}")
company_description = f"Analysis of {company_name}'s sustainability practices."
ai_recommendations = {
"customers": ["Review sustainability claims"],
"investors": ["Monitor ESG disclosures"],
"regulators": ["Standard compliance checks"]
}
    # --- COMPOSITE SENTIMENT SCORE ---
    # Reuse the composite computed above (35% internal / 45% news / 20% reviews)
    # instead of re-running sentiment analysis with a second set of weights;
    # this also avoids re-scoring env_sentences after the "[RISK]" prefixes
    # were prepended.
    internal_sentiment = internal_sentiment_data
    composite_score_norm = composite_score_val / 100.0
# (AI generation already done above - using company_description and ai_recommendations)
# Update result
result = {
"company_name": company_name,
"company_description": company_description,
"last_updated": datetime.now().isoformat(),
"confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
"greenwashingLabel": greenwashing_flag, # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
"detailed_scores": {
"green_keyword_frequency": round(green_keyword_freq, 3),
"vague_keyword_ratio": round(vague_ratio, 3),
"concrete_claim_ratio": round(concrete_ratio, 3),
"overall_sentiment": round(composite_score_norm, 3),
"internal_sentiment": round(internal_sentiment['score'], 3),
"external_sentiment": round(news_sentiment['score'], 3),
"external_sentiment_gap": round(ext_gap, 3),
"emission_sentiment": round(emission_sentiment['score'], 3),
"energy_sentiment": round(energy_sentiment['score'], 3),
"waste_sentiment": round(waste_sentiment['score'], 3),
"relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
},
"external_summary": {
"key_highlights": [
f"Public Sentiment: {news_sentiment['label']}",
f"Risk Level: {overall_risk_str}"
],
# ...
"public_sentiment": news_sentiment['label'],
"recent_news_summary": f"Analysis of {len(news_articles)} articles.",
"possible_bias": "None",
"evidence_links": news_articles[:5]
},
"internal_documents_analysis": {
"major_findings": pdf_scores['env_sentences'][:5],
"compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
"performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
},
"risk_assessment": {
"financial_risk": "High" if risk_level_code == 2 else "Low",
"reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
"compliance_risk": "High" if risk_level_code == 2 else "Low",
"market_risk": "Medium" if final_score < 50 else "Low",
# IMPACT: 3-State Output
"overall_risk_level": overall_risk_str
},
# ... (rest same) ...
"opportunities_and_strengths": [
"Expand concrete data reporting",
"Address external contradictions explicitly"
] if risk_level_code >= 1 else [
"Strong concrete data transparency",
"Positive external sentiment alignment"
],
"reviews_analysis": {
"sentiment_score": reviews_sentiment['score'],
"total_reviews_analyzed": len(reviews),
"review_sources": reviews[:5]
},
"recommended_actions": ai_recommendations,
"hidden_patterns": [
{"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
] if vague_ratio > 0.4 else []
}
report_progress(f"Analysis complete: Score {final_score}/100", 100)
return result
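# Minimal usage sketch (hypothetical company name and PDF path):
#   import asyncio
#   report = asyncio.run(analyze_company("Acme Corp", "reports/acme_esg_2024.pdf"))
#   print(report["risk_assessment"]["overall_risk_level"])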