# greenintellect/app/services/analysis_engine.py
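"""
Analysis engine for GreenIntellect: cross-references a company's sustainability
PDF with scraped news and reviews to detect contradictions, suspicious review
patterns, and greenwashing risk, and assembles a single scored report.
"""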
from datetime import datetime
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
from .scraper import get_company_news, get_company_reviews, report_progress
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
from .llm_generator import generate_company_description, generate_ai_recommendations
# Aspect Keywords
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
def detect_contradictions(pdf_text, news_articles):
"""
Detect contradictions between company claims (PDF) and external reports (news)
Returns list of contradictions with evidence
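Each entry is a dict with 'claim_type', 'evidence_title', 'evidence_url' and 'severity' keys.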
"""
contradictions = []
# Keywords that indicate strong claims
claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
# Keywords that indicate environmental context (Strict Physical Terms only)
# Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
# Exclude regulators to avoid flagging financial fines as greenwashing
# (RBI, SEBI, SEC, etc.)
financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
for article in news_articles:
# Check if article is relevant to environment before counting it as a contradiction
text = (article['title'] + " " + article['content']).lower()
# Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
if any(ex in text for ex in financial_exclusions):
continue
if not any(k in text for k in env_context):
continue
for key in claim_keywords:
if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
contradictions.append({
"claim_type": "Environmental claim questioned",
"evidence_title": article['title'],
"evidence_url": article['url'],
"severity": "High"
})
break
# Keywords that indicate skepticism or allegations
skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
pdf_lower = pdf_text.lower()
has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
if has_strong_claims:
for article in news_articles:
content_lower = article['content'].lower()
if any(keyword in content_lower for keyword in skeptic_keywords):
contradictions.append({
"claim_type": "Environmental commitment",
"evidence_url": article['url'],
"evidence_title": article['title'],
"severity": "High"
})
# New: General Compliance Risk Detection (Not just contradictions)
# Search for specific legal/compliance keywords in all articles
compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']  # all lowercase: they are matched against content_lower, so 'EPA' would never match
for article in news_articles:
content_lower = article['content'].lower()
if any(keyword in content_lower for keyword in compliance_keywords):
contradictions.append({  # reuses the contradictions list; a separate compliance list could be split out later
"claim_type": "Regulatory Compliance Issue",
"evidence_url": article['url'],
"evidence_title": article['title'],
"severity": "Critical"
})
return contradictions
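# Illustrative example (hypothetical article; real dicts come from get_company_news):
#   detect_contradictions(
#       "We are committed to net zero by 2030.",
#       [{"title": "Acme fined over emission violation",
#         "content": "Regulators issued a fine for repeated violations.",
#         "url": "https://example.com/acme"}],
#   )
# The article mentions 'emission' (environmental context) and 'fine'/'violation'
# (compliance keywords), so it yields one "Regulatory Compliance Issue" entry
# with severity "Critical".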
def detect_hidden_patterns(all_reviews):
"""
Analyze reviews to find hidden patterns:
- Sudden changes in sentiment
- Repeated phrases (astroturfing)
- Discrepancies between employee and customer reviews
"""
patterns = []
if len(all_reviews) > 10:
# Check for repeated phrases (potential fake reviews)
content_texts = [r['content'][:500] for r in all_reviews]
unique_ratio = len(set(content_texts)) / len(content_texts)
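# e.g. 10 reviews with only 6 distinct bodies -> unique_ratio 0.6, below the 0.7 threshold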
if unique_ratio < 0.7:
patterns.append({
"pattern": "Potential astroturfing detected",
"description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
"severity": "Medium"
})
# Check for platform discrepancies
glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]
if glassdoor_reviews and reddit_reviews:
patterns.append({
"pattern": "Multi-platform analysis available",
"description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
"severity": "Info"
})
return patterns
async def analyze_company(company_name: str, pdf_path: str):
report_progress(f"Starting comprehensive analysis for {company_name}", 5)
# 1. Process PDF
report_progress("Processing PDF document...", 8)
pdf_text = extract_text_from_pdf(pdf_path)
pdf_sentences = split_sentences(pdf_text)
# --- PERPLEXITY AI INTEGRATION ---
from .perplexity_client import research_company, PERPLEXITY_API_KEY
pplx_data = None
if PERPLEXITY_API_KEY:
report_progress("Conducting deep research...", 15)
pplx_data = research_company(company_name)
# 2. Comprehensive Scraping (ALL available sources)
# Always run scraping to get real news, even if Perplexity is active
news_articles = await get_company_news(company_name)
# Perplexity findings remain in pplx_data for internal scoring; they are not displayed as 'news'
# Progress 50-80% handled by get_company_reviews
reviews = await get_company_reviews(company_name)
# 3. Analyze PDF Content
report_progress("Analyzing PDF content...", 82)
pdf_scores = calculate_scores(pdf_sentences)
# 4. Detect Contradictions and Hidden Patterns
report_progress("Detecting contradictions and patterns...", 85)
contradictions = detect_contradictions(pdf_text, news_articles)
hidden_patterns = detect_hidden_patterns(reviews)
# 5. Analyze External Sentiment with ALL data
report_progress("Analyzing sentiment...", 90)
news_text = [a['content'] for a in news_articles]
reviews_text = [r['content'] for r in reviews]
all_external_text = news_text + reviews_text
news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}
# Aspect-based sentiment (REAL SCORES)
emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)
# 6. Calculate Evidence-Based Score with detailed metrics
report_progress("Calculating final scores...", 95)
# Calculate detailed scores (REAL METRICS)
green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
vague_ratio = calculate_vague_score(pdf_sentences)
concrete_ratio = calculate_concrete_score(pdf_sentences)
# --- IMPROVED SCORING FORMULA ---
# Compute the composite sentiment FIRST and let it drive the external portion of the score;
# the reporting section further below reuses these values instead of recomputing them.
# 1. Internal Sentiment
internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
def get_linear_score_local(s_dict):
# Convert label+confidence to 0-100 scale
if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50) # 50-100
if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50) # 0-50
return 50 # Neutral
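# e.g. {'label': 'Positive', 'score': 0.8} -> 90.0; {'label': 'Negative', 'score': 0.8} -> 10.0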
s_int = get_linear_score_local(internal_sentiment_data)
s_ext = get_linear_score_local(news_sentiment)
s_rev = get_linear_score_local(reviews_sentiment)
# 2. Composite Sentiment Score (0-100)
# 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)
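# e.g. s_int=70, s_ext=60, s_rev=50 -> (70*0.35) + (60*0.45) + (50*0.20) = 61.5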
# 3. Base Score Calculation
# We blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative)
# Start with the Sentiment Score (0-100)
final_score = composite_score_val
# Adjust based on Concrete Data (The "Proof")
# If they have high concrete data, boost the score.
# If they have high vague language, penalize the score.
score_modifier = 0
score_modifier += min(concrete_ratio * 100, 25) # Up to +25 points for concrete data
score_modifier -= min(vague_ratio * 50, 20) # Up to -20 points for vague language
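# e.g. concrete_ratio 0.12 -> +12, vague_ratio 0.30 -> -15, net modifier -3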
# Apply modifier
final_score += score_modifier
# Contradiction Penalty (Facts Check)
if contradictions:
# Heavily penalize for contradictions
final_score -= (len(contradictions) * 15)
# Cap at 0-100
final_score = max(0, min(100, final_score))
# External sentiment gap between news and reviews on the shared 0-100 linear scale
# (raw confidence scores ignore the Positive/Negative label, so their difference is misleading)
ext_gap = abs(s_ext - s_rev) / 100.0
# Determine label
if final_score >= 80: label = "Excellent"
elif final_score >= 60: label = "Good"
elif final_score >= 40: label = "Average"
elif final_score >= 20: label = "At Risk"
else: label = "Greenwashing"
# Determine risk level (3-State System)
# 2 = Greenwashing (High/Critical)
# 1 = At Risk (Medium)
# 0 = No Risk (Low)
risk_level_code = 0
risk_reasons = []
# 1. Contradictions (Immediate Greenwashing)
if contradictions:
risk_level_code = 2
risk_reasons.append("External contradictions found")
# 2. Score Thresholds
if final_score < 40:
risk_level_code = max(risk_level_code, 2)
risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
elif final_score < 60:
risk_level_code = max(risk_level_code, 1) # At Risk
# 3. Vague Language
if vague_ratio > 0.50 and concrete_ratio < 0.10:
risk_level_code = 2
risk_reasons.append("Excessive vague language")
elif vague_ratio > 0.40 and concrete_ratio < 0.20:
risk_level_code = max(risk_level_code, 1) # At Risk
# 4. Empty Claims
if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
risk_level_code = 2
risk_reasons.append("Positive press without concrete data")
# --- SAFE HARBOR OVERRIDE ---
high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
pass_safe_harbor = False
if concrete_ratio > 0.05 and len(contradictions) < 2:
if is_high_risk:
if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
pass_safe_harbor = True
else:
if risk_level_code < 2:
risk_level_code = 2
risk_reasons.append("High Risk Industry without exceptional mitigation")
elif emission_sentiment['label'] != 'Negative':
pass_safe_harbor = True
if pass_safe_harbor:
risk_level_code = 0 # Force No Risk
if risk_reasons:
risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
print(f"SAFE HARBOR TRIGGERED for {company_name}")
# Map code to string
# IMPACT: User requested specific labels
if risk_level_code == 2:
overall_risk_str = "Greenwashing"
greenwashing_flag = 1
elif risk_level_code == 1:
overall_risk_str = "At Risk"
greenwashing_flag = 0  # binary flag is reserved for confirmed greenwashing (risk code 2)
else:
overall_risk_str = "No Risk"
greenwashing_flag = 0
# Update reasons into result
if risk_reasons and risk_level_code >= 1:
pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']
# --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
company_description = ""
ai_recommendations = {}
if pplx_data:
report_progress("Using insights...", 95)
company_description = pplx_data.get("description", "Description unavailable.")
ai_recommendations = pplx_data.get("recommendations", {})
else:
# Fallback to Gemini or defaults (generate_* helpers already imported at module top)
try:
report_progress("Generating insights...", 98)
company_description = generate_company_description(company_name)
pre_result = {
"greenwashingLabel": greenwashing_flag,
"internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
"contradictions_detected": contradictions,
"external_summary": {"public_sentiment": news_sentiment['label']}
}
ai_recommendations = generate_ai_recommendations(company_name, pre_result)
except Exception as e:
print(f"AI Generation fallback failed: {e}")
company_description = f"Analysis of {company_name}'s sustainability practices."
ai_recommendations = {
"customers": ["Review sustainability claims"],
"investors": ["Monitor ESG disclosures"],
"regulators": ["Standard compliance checks"]
}
# --- COMPOSITE SENTIMENT SCORE ---
# Reuse the linear sentiment scores computed in the scoring section above so the
# reported composite matches the one that drove the final score (35/45/20 weights).
internal_sentiment = internal_sentiment_data
composite_score = composite_score_val
composite_score_norm = composite_score / 100.0
# (AI generation already done above - using company_description and ai_recommendations)
# Update result
result = {
"company_name": company_name,
"company_description": company_description,
"last_updated": datetime.now().isoformat(),
"confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
"greenwashingLabel": greenwashing_flag, # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
"detailed_scores": {
"green_keyword_frequency": round(green_keyword_freq, 3),
"vague_keyword_ratio": round(vague_ratio, 3),
"concrete_claim_ratio": round(concrete_ratio, 3),
"overall_sentiment": round(composite_score_norm, 3),
"internal_sentiment": round(internal_sentiment['score'], 3),
"external_sentiment": round(news_sentiment['score'], 3),
"external_sentiment_gap": round(ext_gap, 3),
"emission_sentiment": round(emission_sentiment['score'], 3),
"energy_sentiment": round(energy_sentiment['score'], 3),
"waste_sentiment": round(waste_sentiment['score'], 3),
"relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
},
"external_summary": {
"key_highlights": [
f"Public Sentiment: {news_sentiment['label']}",
f"Risk Level: {overall_risk_str}"
],
# ...
"public_sentiment": news_sentiment['label'],
"recent_news_summary": f"Analysis of {len(news_articles)} articles.",
"possible_bias": "None",
"evidence_links": news_articles[:5]
},
"internal_documents_analysis": {
"major_findings": pdf_scores['env_sentences'][:5],
"compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
"performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
},
"risk_assessment": {
"financial_risk": "High" if risk_level_code == 2 else "Low",
"reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
"compliance_risk": "High" if risk_level_code == 2 else "Low",
"market_risk": "Medium" if final_score < 50 else "Low",
# IMPACT: 3-State Output
"overall_risk_level": overall_risk_str
},
# ... (rest same) ...
"opportunities_and_strengths": [
"Expand concrete data reporting",
"Address external contradictions explicitly"
] if risk_level_code >= 1 else [
"Strong concrete data transparency",
"Positive external sentiment alignment"
],
"reviews_analysis": {
"sentiment_score": reviews_sentiment['score'],
"total_reviews_analyzed": len(reviews),
"review_sources": reviews[:5]
},
"recommended_actions": ai_recommendations,
"hidden_patterns": [
{"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
] if vague_ratio > 0.4 else []
}
report_progress(f"Analysis complete: Score {final_score}/100", 100)
return result