# greenintellect/app/services/analysis_engine.py
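"""
Analysis engine for GreenIntellect: cross-references a company's sustainability
PDF with scraped news and reviews to detect contradictions, suspicious review
patterns, and greenwashing risk, and assembles a single scored report.
"""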
from datetime import datetime
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
from .llm_generator import generate_company_description, generate_ai_recommendations
# Aspect Keywords
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
def detect_contradictions(pdf_text, news_articles):
"""
Detect contradictions between company claims (PDF) and external reports (news)
Returns list of contradictions with evidence
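Each entry is a dict with 'claim_type', 'evidence_title', 'evidence_url' and 'severity' keys.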
"""
contradictions = []
# Keywords that indicate strong claims
claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
# Keywords that indicate environmental context (Strict Physical Terms only)
# Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
# Exclude regulators to avoid flagging financial fines as greenwashing
# (RBI, SEBI, SEC, etc.)
financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
for article in news_articles:
# Check if article is relevant to environment before counting it as a contradiction
text = (article['title'] + " " + article['content']).lower()
# Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
if any(ex in text for ex in financial_exclusions):
continue
if not any(k in text for k in env_context):
continue
for key in claim_keywords:
if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
contradictions.append({
"claim_type": "Environmental claim questioned",
"evidence_title": article['title'],
"evidence_url": article['url'],
"severity": "High"
})
break
# Keywords that indicate skepticism or allegations
skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
pdf_lower = pdf_text.lower()
has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
if has_strong_claims:
for article in news_articles:
content_lower = article['content'].lower()
if any(keyword in content_lower for keyword in skeptic_keywords):
contradictions.append({
"claim_type": "Environmental commitment",
"evidence_url": article['url'],
"evidence_title": article['title'],
"severity": "High"
})
# New: General Compliance Risk Detection (Not just contradictions)
# Search for specific legal/compliance keywords in all articles
compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']  # all lowercase: they are matched against content_lower, so 'EPA' would never match
for article in news_articles:
content_lower = article['content'].lower()
if any(keyword in content_lower for keyword in compliance_keywords):
contradictions.append({  # reuses the contradictions list; a separate compliance list could be split out later
"claim_type": "Regulatory Compliance Issue",
"evidence_url": article['url'],
"evidence_title": article['title'],
"severity": "Critical"
})
return contradictions
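# Illustrative example (hypothetical article; real dicts come from get_company_news):
#   detect_contradictions(
#       "We are committed to net zero by 2030.",
#       [{"title": "Acme fined over emission violation",
#         "content": "Regulators issued a fine for repeated violations.",
#         "url": "https://example.com/acme"}],
#   )
# The article mentions 'emission' (environmental context) and 'fine'/'violation'
# (compliance keywords), so it yields one "Regulatory Compliance Issue" entry
# with severity "Critical".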
def detect_hidden_patterns(all_reviews):
"""
Analyze reviews to find hidden patterns:
- Sudden changes in sentiment
- Repeated phrases (astroturfing)
- Discrepancies between employee and customer reviews
"""
patterns = []
if len(all_reviews) > 10:
# Check for repeated phrases (potential fake reviews)
content_texts = [r['content'][:500] for r in all_reviews]
unique_ratio = len(set(content_texts)) / len(content_texts)
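# e.g. 10 reviews with only 6 distinct bodies -> unique_ratio 0.6, below the 0.7 threshold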
if unique_ratio < 0.7:
patterns.append({
"pattern": "Potential astroturfing detected",
"description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
"severity": "Medium"
})
# Check for platform discrepancies
glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]
if glassdoor_reviews and reddit_reviews:
patterns.append({
"pattern": "Multi-platform analysis available",
"description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
"severity": "Info"
})
return patterns
async def analyze_company(company_name: str, pdf_path: str):
report_progress(f"Starting comprehensive analysis for {company_name}", 5)
# 1. Process PDF
report_progress("Processing PDF document...", 8)
pdf_text = extract_text_from_pdf(pdf_path)
pdf_sentences = split_sentences(pdf_text)
# --- PERPLEXITY AI INTEGRATION ---
from .perplexity_client import research_company, PERPLEXITY_API_KEY
pplx_data = None
if PERPLEXITY_API_KEY:
report_progress("Conducting deep research...", 15)
pplx_data = research_company(company_name)
# 2. Comprehensive Scraping (ALL available sources)
# Always run scraping to get real news, even if Perplexity is active
news_articles = await get_company_news(company_name)
# Perplexity findings remain in pplx_data for internal scoring; they are not displayed as 'news'
# Progress 50-80% handled by get_company_reviews
reviews = await get_company_reviews(company_name)
# 3. Analyze PDF Content
report_progress("Analyzing PDF content...", 82)
pdf_scores = calculate_scores(pdf_sentences)
# 4. Detect Contradictions and Hidden Patterns
report_progress("Detecting contradictions and patterns...", 85)
contradictions = detect_contradictions(pdf_text, news_articles)
hidden_patterns = detect_hidden_patterns(reviews)
# 5. Analyze External Sentiment with ALL data
report_progress("Analyzing sentiment...", 90)
news_text = [a['content'] for a in news_articles]
reviews_text = [r['content'] for r in reviews]
all_external_text = news_text + reviews_text
news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}
# Aspect-based sentiment (REAL SCORES)
emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)
# 6. Calculate Evidence-Based Score with detailed metrics
report_progress("Calculating final scores...", 95)
# Calculate detailed scores (REAL METRICS)
green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
vague_ratio = calculate_vague_score(pdf_sentences)
concrete_ratio = calculate_concrete_score(pdf_sentences)
# --- IMPROVED SCORING FORMULA ---
# Compute the composite sentiment FIRST and let it drive the external portion of the score;
# the reporting section further below reuses these values instead of recomputing them.
# 1. Internal Sentiment
internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
def get_linear_score_local(s_dict):
# Convert label+confidence to 0-100 scale
if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50) # 50-100
if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50) # 0-50
return 50 # Neutral
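# e.g. {'label': 'Positive', 'score': 0.8} -> 90.0; {'label': 'Negative', 'score': 0.8} -> 10.0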
s_int = get_linear_score_local(internal_sentiment_data)
s_ext = get_linear_score_local(news_sentiment)
s_rev = get_linear_score_local(reviews_sentiment)
# 2. Composite Sentiment Score (0-100)
# 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)
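# e.g. s_int=70, s_ext=60, s_rev=50 -> (70*0.35) + (60*0.45) + (50*0.20) = 61.5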
# 3. Base Score Calculation
# We blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative)
# Start with the Sentiment Score (0-100)
final_score = composite_score_val
# Adjust based on Concrete Data (The "Proof")
# If they have high concrete data, boost the score.
# If they have high vague language, penalize the score.
score_modifier = 0
score_modifier += min(concrete_ratio * 100, 25) # Up to +25 points for concrete data
score_modifier -= min(vague_ratio * 50, 20) # Up to -20 points for vague language
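# e.g. concrete_ratio 0.12 -> +12, vague_ratio 0.30 -> -15, net modifier -3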
# Apply modifier
final_score += score_modifier
# Contradiction Penalty (Facts Check)
if contradictions:
# Heavily penalize for contradictions
final_score -= (len(contradictions) * 15)
# Cap at 0-100
final_score = max(0, min(100, final_score))
# External sentiment gap between news and reviews on the shared 0-100 linear scale
# (raw confidence scores ignore the Positive/Negative label, so their difference is misleading)
ext_gap = abs(s_ext - s_rev) / 100.0
# Determine label
if final_score >= 80: label = "Excellent"
elif final_score >= 60: label = "Good"
elif final_score >= 40: label = "Average"
elif final_score >= 20: label = "At Risk"
else: label = "Greenwashing"
# Determine risk level (3-State System)
# 2 = Greenwashing (High/Critical)
# 1 = At Risk (Medium)
# 0 = No Risk (Low)
risk_level_code = 0
risk_reasons = []
# 1. Contradictions (Immediate Greenwashing)
if contradictions:
risk_level_code = 2
risk_reasons.append("External contradictions found")
# 2. Score Thresholds
if final_score < 40:
risk_level_code = max(risk_level_code, 2)
risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
elif final_score < 60:
risk_level_code = max(risk_level_code, 1) # At Risk
# 3. Vague Language
if vague_ratio > 0.50 and concrete_ratio < 0.10:
risk_level_code = 2
risk_reasons.append("Excessive vague language")
elif vague_ratio > 0.40 and concrete_ratio < 0.20:
risk_level_code = max(risk_level_code, 1) # At Risk
# 4. Empty Claims
if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
risk_level_code = 2
risk_reasons.append("Positive press without concrete data")
# --- SAFE HARBOR OVERRIDE ---
high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
pass_safe_harbor = False
if concrete_ratio > 0.05 and len(contradictions) < 2:
if is_high_risk:
if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
pass_safe_harbor = True
else:
if risk_level_code < 2:
risk_level_code = 2
risk_reasons.append("High Risk Industry without exceptional mitigation")
elif emission_sentiment['label'] != 'Negative':
pass_safe_harbor = True
if pass_safe_harbor:
risk_level_code = 0 # Force No Risk
if risk_reasons:
risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
print(f"SAFE HARBOR TRIGGERED for {company_name}")
# Map code to string
# IMPACT: User requested specific labels
if risk_level_code == 2:
overall_risk_str = "Greenwashing"
greenwashing_flag = 1
elif risk_level_code == 1:
overall_risk_str = "At Risk"
greenwashing_flag = 0  # binary flag is reserved for confirmed greenwashing (risk code 2)
else:
overall_risk_str = "No Risk"
greenwashing_flag = 0
# Update reasons into result
if risk_reasons and risk_level_code >= 1:
pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']
# --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
company_description = ""
ai_recommendations = {}
if pplx_data:
report_progress("Using insights...", 95)
company_description = pplx_data.get("description", "Description unavailable.")
ai_recommendations = pplx_data.get("recommendations", {})
else:
# Fallback to Gemini or defaults (generate_* helpers already imported at module top)
try:
report_progress("Generating insights...", 98)
company_description = generate_company_description(company_name)
pre_result = {
"greenwashingLabel": greenwashing_flag,
"internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
"contradictions_detected": contradictions,
"external_summary": {"public_sentiment": news_sentiment['label']}
}
ai_recommendations = generate_ai_recommendations(company_name, pre_result)
except Exception as e:
print(f"AI Generation fallback failed: {e}")
company_description = f"Analysis of {company_name}'s sustainability practices."
ai_recommendations = {
"customers": ["Review sustainability claims"],
"investors": ["Monitor ESG disclosures"],
"regulators": ["Standard compliance checks"]
}
# --- COMPOSITE SENTIMENT SCORE ---
# Reuse the linear sentiment scores computed in the scoring section above so the
# reported composite matches the one that drove the final score (35/45/20 weights).
internal_sentiment = internal_sentiment_data
composite_score = composite_score_val
composite_score_norm = composite_score / 100.0
# (AI generation already done above - using company_description and ai_recommendations)
# Update result
result = {
"company_name": company_name,
"company_description": company_description,
"last_updated": datetime.now().isoformat(),
"confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
"greenwashingLabel": greenwashing_flag, # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
"detailed_scores": {
"green_keyword_frequency": round(green_keyword_freq, 3),
"vague_keyword_ratio": round(vague_ratio, 3),
"concrete_claim_ratio": round(concrete_ratio, 3),
"overall_sentiment": round(composite_score_norm, 3),
"internal_sentiment": round(internal_sentiment['score'], 3),
"external_sentiment": round(news_sentiment['score'], 3),
"external_sentiment_gap": round(ext_gap, 3),
"emission_sentiment": round(emission_sentiment['score'], 3),
"energy_sentiment": round(energy_sentiment['score'], 3),
"waste_sentiment": round(waste_sentiment['score'], 3),
"relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
},
"external_summary": {
"key_highlights": [
f"Public Sentiment: {news_sentiment['label']}",
f"Risk Level: {overall_risk_str}"
],
# ...
"public_sentiment": news_sentiment['label'],
"recent_news_summary": f"Analysis of {len(news_articles)} articles.",
"possible_bias": "None",
"evidence_links": news_articles[:5]
},
"internal_documents_analysis": {
"major_findings": pdf_scores['env_sentences'][:5],
"compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
"performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
},
"risk_assessment": {
"financial_risk": "High" if risk_level_code == 2 else "Low",
"reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
"compliance_risk": "High" if risk_level_code == 2 else "Low",
"market_risk": "Medium" if final_score < 50 else "Low",
# IMPACT: 3-State Output
"overall_risk_level": overall_risk_str
},
# ... (rest same) ...
"opportunities_and_strengths": [
"Expand concrete data reporting",
"Address external contradictions explicitly"
] if risk_level_code >= 1 else [
"Strong concrete data transparency",
"Positive external sentiment alignment"
],
"reviews_analysis": {
"sentiment_score": reviews_sentiment['score'],
"total_reviews_analyzed": len(reviews),
"review_sources": reviews[:5]
},
"recommended_actions": ai_recommendations,
"hidden_patterns": [
{"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
] if vague_ratio > 0.4 else []
}
report_progress(f"Analysis complete: Score {final_score}/100", 100)
return result