Tanxshh commited on
Commit
02cc7f6
·
1 Parent(s): 8110699

Deploy GreenIntellect Backend API with ML models and scraping

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +26 -0
  2. .gitattributes +2 -33
  3. Dockerfile +72 -0
  4. README.md +48 -5
  5. all_feature_columns.pkl +3 -0
  6. app/__init__.py +0 -0
  7. app/__pycache__/__init__.cpython-310.pyc +0 -0
  8. app/__pycache__/__init__.cpython-311.pyc +0 -0
  9. app/__pycache__/main.cpython-310.pyc +0 -0
  10. app/__pycache__/main.cpython-311.pyc +0 -0
  11. app/api/__pycache__/endpoints.cpython-310.pyc +0 -0
  12. app/api/__pycache__/endpoints.cpython-311.pyc +0 -0
  13. app/api/endpoints.py +477 -0
  14. app/db/__pycache__/models.cpython-310.pyc +0 -0
  15. app/db/__pycache__/models.cpython-311.pyc +0 -0
  16. app/db/__pycache__/session.cpython-310.pyc +0 -0
  17. app/db/__pycache__/session.cpython-311.pyc +0 -0
  18. app/db/models.py +37 -0
  19. app/db/session.py +20 -0
  20. app/main.py +33 -0
  21. app/services/__pycache__/analysis_engine.cpython-310.pyc +0 -0
  22. app/services/__pycache__/analysis_engine.cpython-311.pyc +0 -0
  23. app/services/__pycache__/hugchat_client.cpython-311.pyc +0 -0
  24. app/services/__pycache__/llm_generator.cpython-311.pyc +0 -0
  25. app/services/__pycache__/ml_logic.cpython-311.pyc +0 -0
  26. app/services/__pycache__/ml_models.cpython-310.pyc +0 -0
  27. app/services/__pycache__/ml_models.cpython-311.pyc +0 -0
  28. app/services/__pycache__/pdf_processor.cpython-310.pyc +0 -0
  29. app/services/__pycache__/pdf_processor.cpython-311.pyc +0 -0
  30. app/services/__pycache__/perplexity_client.cpython-311.pyc +0 -0
  31. app/services/__pycache__/scoring.cpython-310.pyc +0 -0
  32. app/services/__pycache__/scoring.cpython-311.pyc +0 -0
  33. app/services/__pycache__/scraper.cpython-310.pyc +0 -0
  34. app/services/__pycache__/scraper.cpython-311.pyc +0 -0
  35. app/services/analysis_engine.py +425 -0
  36. app/services/hugchat_client.py +54 -0
  37. app/services/llm_generator.py +229 -0
  38. app/services/ml_logic.py +137 -0
  39. app/services/ml_models.py +26 -0
  40. app/services/pdf_processor.py +21 -0
  41. app/services/perplexity_client.py +58 -0
  42. app/services/scoring.py +139 -0
  43. app/services/scraper.py +393 -0
  44. binary_to_report_name_mapping.pkl +3 -0
  45. category_to_greenwashing_mapping.pkl +3 -0
  46. ensemble_model.pkl +3 -0
  47. ml_models/all_feature_columns.pkl +3 -0
  48. ml_models/binary_to_report_name_mapping.pkl +3 -0
  49. ml_models/category_to_greenwashing_mapping.pkl +3 -0
  50. ml_models/ensemble_model.pkl +3 -0
.dockerignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ venv/
6
+ .venv/
7
+
8
+ # Database (created at runtime)
9
+ *.db
10
+
11
+ # Uploads (created at runtime)
12
+ uploads/
13
+
14
+ # Environment files
15
+ .env
16
+ .env.local
17
+
18
+ # IDE
19
+ .vscode/
20
+ .idea/
21
+
22
+ # Logs
23
+ *.log
24
+
25
+ # Git
26
+ .git/
.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Git LFS for large model files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  *.pkl filter=lfs diff=lfs merge=lfs -text
3
  *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hugging Face Spaces - GreenIntellect Backend API
# Python FastAPI + ML Models + Scraping

FROM python:3.11-slim

# Create non-root user (required by Hugging Face)
RUN useradd -m -u 1000 user
WORKDIR /app

# Install system dependencies for Selenium/Chromium
# (chromium + chromium-driver come from Debian; the lib* packages are the
# shared libraries headless Chromium needs to start in a slim image)
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    gnupg \
    chromium \
    chromium-driver \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libwayland-client0 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Set Chrome environment variables (read by the scraper to locate the browser)
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

# Copy and install Python dependencies first so this layer is cached
# independently of application-code changes
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --timeout=300 -r /app/requirements.txt

# Copy ML model files (duplicated at /app and /app/ml_models below —
# NOTE(review): confirm both locations are actually read before pruning)
COPY ensemble_model.pkl /app/ensemble_model.pkl
COPY all_feature_columns.pkl /app/all_feature_columns.pkl
COPY binary_to_report_name_mapping.pkl /app/binary_to_report_name_mapping.pkl
COPY category_to_greenwashing_mapping.pkl /app/category_to_greenwashing_mapping.pkl

# Copy backend application
COPY app /app/app
COPY ml_models /app/ml_models

# Create directories; chown so the non-root user can write uploads/ and the SQLite DB
RUN mkdir -p /app/uploads && chown -R user:user /app

# Switch to non-root user
USER user

# Environment variables (7860 is the port Hugging Face Spaces expects)
ENV PORT=7860
ENV HOST=0.0.0.0
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app

EXPOSE 7860

# Start FastAPI
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,54 @@
1
  ---
2
- title: Greenintellect
3
- emoji: 😻
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GreenIntellect API
3
+ emoji: 🌿
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
+ # 🌿 GreenIntellect API
12
+
13
+ AI-powered API for analyzing sustainability reports and detecting greenwashing.
14
+
15
+ ## API Endpoints
16
+
17
+ | Endpoint | Method | Description |
18
+ |----------|--------|-------------|
19
+ | `/api/` | GET | API health check |
20
+ | `/api/analyze` | POST | Analyze text for greenwashing |
21
+ | `/api/upload` | POST | Upload PDF for analysis |
22
+ | `/api/requests` | GET | Get analysis requests |
23
+ | `/` | GET | API welcome message |
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ import requests
29
+
30
+ # Analyze text
31
+ response = requests.post(
32
+ "https://tanxshh-greenintellect.hf.space/api/analyze",
33
+ json={"company_name": "Example Corp", "text": "Our sustainable practices..."}
34
+ )
35
+ print(response.json())
36
+ ```
37
+
38
+ ## Features
39
+
40
+ - 📄 PDF/Text Analysis
41
+ - 🔍 Greenwashing Detection
42
+ - 📊 Sentiment Analysis
43
+ - 🌐 Web Scraping (News & Reviews)
44
+ - 🤖 AI-powered Insights
45
+
46
+ ## Technology
47
+
48
+ - FastAPI + Python 3.11
49
+ - FinBERT & Sentence Transformers
50
+ - Selenium + Chromium for scraping
51
+ - SQLite Database
52
+
53
+ ---
54
+ Built with ❤️ for a sustainable future
all_feature_columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
3
+ size 219
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (113 Bytes). View file
 
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc ADDED
Binary file (952 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (1.53 kB). View file
 
app/api/__pycache__/endpoints.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
app/api/__pycache__/endpoints.cpython-311.pyc ADDED
Binary file (25.7 kB). View file
 
app/api/endpoints.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
2
+ from sqlalchemy.orm import Session
3
+ from typing import List
4
+ import shutil
5
+ import os
6
+ import json
7
+ from datetime import datetime
8
+ import csv
9
+ import io
10
+ import time
11
+ import random
12
+ from ..db.session import get_db
13
+ from ..db.models import Company, AnalysisRequest
14
+ from ..services.analysis_engine import analyze_company
15
+ from ..services.ml_logic import predict_greenwashing_risk
16
+
17
router = APIRouter()

# Directory for uploaded documents; created eagerly at import time so the
# first upload request never races against a missing folder.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
21
+
22
@router.post("/requests")
async def create_request(
    company_name: str = Form(...),
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Accept a document upload and queue it as a pending analysis request.

    The file is saved under UPLOAD_DIR and a pending AnalysisRequest row is
    created; the actual analysis runs later via POST /requests/{id}/approve.

    Returns the newly created AnalysisRequest row.
    """
    # Use only the basename of the client-supplied filename so a crafted
    # name like "../../etc/cron.d/x" cannot escape the upload directory
    # (path-traversal hardening).
    safe_name = os.path.basename(file.filename or "upload.bin")
    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Create Request Record (Pending)
    db_request = AnalysisRequest(
        user_id="demo-user",  # TODO: replace with the authenticated user id
        company_name=company_name,
        document_name=safe_name,
        document_content=file_path,  # stores the path; text is extracted at approval time
        status="pending"
    )
    db.add(db_request)
    db.commit()
    db.refresh(db_request)

    return db_request
46
+
47
@router.post("/requests/{id}/approve")
async def approve_request(id: int, db: Session = Depends(get_db)):
    """Approve a pending request: run the full company analysis and persist it.

    State machine: pending -> processing -> completed (or failed on error).
    On success the result is also copied onto the Company row (created if
    missing) so the dashboard can read it without joining requests.

    Raises:
        HTTPException 404: unknown request id.
        HTTPException 400: request is not in "pending" state.
        HTTPException 500: analysis raised; the error text is stored in
            rejection_reason and the request is marked "failed".
    """
    db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if not db_request:
        raise HTTPException(status_code=404, detail="Request not found")

    if db_request.status != "pending":
        raise HTTPException(status_code=400, detail="Request already processed")

    try:
        # Mark as processing and commit immediately so concurrent approvals
        # of the same request see the state change.
        db_request.status = "processing"
        db.commit()

        # Run Analysis
        # Note: document_content currently holds the file path from create_request
        file_path = db_request.document_content
        result = await analyze_company(db_request.company_name, file_path)

        # Update Request
        db_request.status = "completed"
        db_request.analysis_result = result

        # Update or Create Company Record
        company = db.query(Company).filter(Company.name == db_request.company_name).first()
        if not company:
            company = Company(name=db_request.company_name)
            db.add(company)

        company.analysis_result = result
        company.last_analysis_date = datetime.now()

        db.commit()

        return result

    except Exception as e:
        # Record the failure on the request row so the UI can surface it,
        # then surface the same message to the caller.
        db_request.status = "failed"
        db_request.rejection_reason = str(e)
        db.commit()
        raise HTTPException(status_code=500, detail=str(e))
88
+
89
@router.post("/requests/{id}/reject")
def reject_request(id: int, reason: str = Form(...), db: Session = Depends(get_db)):
    """Reject a pending analysis request by deleting its row.

    The supplied reason is echoed back to the caller (the row itself is
    removed, so the reason is not persisted).

    Raises:
        HTTPException 404: unknown request id.
    """
    db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if not db_request:
        raise HTTPException(status_code=404, detail="Request not found")

    # Capture the name BEFORE deleting: after delete()+commit() the ORM
    # instance is expired/deleted and attribute access raises
    # (ObjectDeletedError / DetachedInstanceError).
    company_name = db_request.company_name

    db.delete(db_request)
    db.commit()
    return {"message": f"Request for {company_name} rejected and deleted", "reason": reason}
99
+
100
@router.get("/requests")
def get_requests(db: Session = Depends(get_db)):
    """Return every analysis request on record, regardless of status."""
    all_requests = db.query(AnalysisRequest).all()
    return all_requests
103
+
104
@router.get("/companies")
def get_companies(db: Session = Depends(get_db)):
    """Return every company row, including its stored analysis JSON."""
    all_companies = db.query(Company).all()
    return all_companies
107
+
108
@router.post("/companies/bulk")
def bulk_import_companies(companies: List[dict], db: Session = Depends(get_db)):
    """Bulk import companies from CSV or other sources.

    Each entry may carry: name (required), description, website, analysis.
    Companies are matched by name: existing rows are updated in place, new
    ones are created. Entries without a "name" are skipped — previously a
    nameless entry would have created a NULL-named row (and, because of the
    unique constraint on name, broken every later nameless import).

    Returns the number of rows touched and their names.
    """
    imported = []
    for company_data in companies:
        name = company_data.get("name")
        if not name:
            # Robustness: never create a company without a usable name.
            continue

        # Check if company already exists
        existing = db.query(Company).filter(Company.name == name).first()
        if existing:
            # Update existing; description/website keep their old values
            # when the payload omits them.
            existing.analysis_result = company_data.get("analysis")
            existing.last_analysis_date = datetime.now()
            existing.description = company_data.get("description", existing.description)
            existing.website = company_data.get("website", existing.website)
            imported.append(existing)
        else:
            # Create new
            new_company = Company(
                name=name,
                description=company_data.get("description", ""),
                website=company_data.get("website", ""),
                analysis_result=company_data.get("analysis"),
                last_analysis_date=datetime.now()
            )
            db.add(new_company)
            imported.append(new_company)

    # Single commit for the whole batch keeps the import atomic.
    db.commit()
    return {"imported": len(imported), "companies": [c.name for c in imported]}
136
+
137
@router.get("/company/{id}")
def get_company(id: int, db: Session = Depends(get_db)):
    """Return one company by ID.

    Raises:
        HTTPException 404: unknown company id. (Previously this silently
        returned null, inconsistent with delete_company's 404 behaviour.)
    """
    company = db.query(Company).filter(Company.id == id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")
    return company
140
+
141
@router.delete("/companies/all")
def delete_all_companies(db: Session = Depends(get_db)):
    """Delete all companies from the database."""
    # Bulk delete at the query level; returns the number of rows removed.
    removed = db.query(Company).delete()
    db.commit()
    return {"message": f"Deleted {removed} companies"}
147
+
148
@router.delete("/company/{id}")
def delete_company(id: int, db: Session = Depends(get_db)):
    """Delete a specific company by ID.

    Raises:
        HTTPException 404: unknown company id.
    """
    company = db.query(Company).filter(Company.id == id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")

    # Capture the name BEFORE deleting: reading company.name after
    # delete()+commit() hits an expired/deleted ORM instance and raises.
    company_name = company.name

    db.delete(company)
    db.commit()
    return {"message": f"Deleted company {company_name}"}
158
+
159
@router.delete("/requests/cleanup")
def cleanup_requests(db: Session = Depends(get_db)):
    """Delete requests that are completed, rejected, or failed"""
    terminal_states = ["completed", "rejected", "failed"]
    # Bulk delete without syncing the session — nothing else in this
    # request handler reads the deleted rows afterwards.
    removed = (
        db.query(AnalysisRequest)
        .filter(AnalysisRequest.status.in_(terminal_states))
        .delete(synchronize_session=False)
    )
    db.commit()
    return {"message": f"Cleaned up {removed} processed requests"}
167
+
168
@router.delete("/request/{id}")
def delete_request(id: int, db: Session = Depends(get_db)):
    """Force delete a request"""
    target = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="Request not found")
    db.delete(target)
    db.commit()
    return {"message": "Request deleted"}
177
+
178
@router.post("/companies/upload-csv")
async def upload_companies_csv(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """
    Upload CSV for live greenwashing analysis with BATCH AI processing.

    Flow per row:
      1. Read flexible column names (company/name, description/text/claims).
      2. Compute missing features (sentiment, keyword frequency, vague/concrete
         ratios) from the text when the CSV does not supply them.
      3. If the CSV carries a greenwashing label, trust it and skip AI;
         otherwise fall back to the ML model, then apply heuristic overrides.
      4. Queue the row; every `batch_size` rows are sent to the AI service in
         one batch and the merged results are saved to the Company table.

    Raises:
        HTTPException 400: uploaded file is not a .csv.
    """
    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a CSV.")

    # utf-8-sig strips a BOM that Excel-exported CSVs often carry.
    content = await file.read()
    decoded = content.decode('utf-8-sig')
    csv_reader = csv.DictReader(io.StringIO(decoded))

    # Normalise headers so lookups below can use lowercase names.
    if csv_reader.fieldnames:
        csv_reader.fieldnames = [f.strip().lower() for f in csv_reader.fieldnames]

    print(f"[DEBUG] CSV Headers found: {csv_reader.fieldnames}")

    results = []
    gemini_batch = []
    batch_size = 10

    from app.services.perplexity_client import research_company, PERPLEXITY_API_KEY
    from app.services.llm_generator import generate_batch_insights

    # Import scoring utilities if not already imported (better to move to top, but here for context)
    from app.services.scoring import analyze_sentiment, calculate_vague_score, calculate_concrete_score
    import re

    # Helper for counting keywords
    def count_keywords(text: str, keywords: list) -> int:
        # Whole-word matches only (\b anchors); keywords are escaped so
        # entries like 'co2' or hyphenated terms are matched literally.
        count = 0
        text_lower = text.lower()
        for k in keywords:
            count += len(re.findall(r'\b' + re.escape(k) + r'\b', text_lower))
        return count

    # Keyword lists (reused from analysis_engine concept)
    # NOTE(review): only GREEN_KEYWORDS is used below; the other three lists
    # appear to be dead in this function — confirm before removing.
    GREEN_KEYWORDS = ['sustainable', 'eco-friendly', 'green', 'carbon neutral', 'net zero', 'renewable', 'biodegradable']
    EMISSION_KEYWORDS = ['emission', 'co2', 'carbon']
    ENERGY_KEYWORDS = ['energy', 'solar', 'wind', 'power']
    WASTE_KEYWORDS = ['waste', 'recycling', 'plastic']

    # NOTE(review): gemini_batch/batch_size are re-initialised here, shadowing
    # the identical assignments above — harmless but redundant.
    gemini_batch = []
    batch_size = 10

    def process_batch_and_save(batch_items):
        # Flush a queued batch: ask the AI service for insights on the items
        # that need them, then persist one Company row per item.
        if not batch_items: return

        # Split batch into AI-needed and Fast-Path
        ai_needed_items = [item for item in batch_items if not item.get('skip_ai')]
        fast_path_items = [item for item in batch_items if item.get('skip_ai')]

        batch_insights = {}

        # 1. Generate AI Insights ONLY for needed items
        if ai_needed_items:
            ai_inputs = [{"name": item['name'], "context": item['context']} for item in ai_needed_items]
            print(f"Processing batch of {len(ai_inputs)} companies via AI Service...")

            # Add small delay only if calling AI
            if len(ai_inputs) > 0:
                time.sleep(2)

            batch_insights = generate_batch_insights(ai_inputs)

        # 2. Merge and Save (Process both lists)
        for item in batch_items:
            name = item['name']

            if item.get('skip_ai'):
                # Fast Path Defaults
                desc = item.get('text')[:500] if item.get('text') else "Imported via CSV (Manual Assessment)"
                recs = ["Maintain current transparency"] if item['gw_label'] == 0 else ["Improve data disclosure"]
            else:
                # AI Results
                insights = batch_insights.get(name, {})
                desc = insights.get("description", "AI description pending or unavailable.")
                recs = insights.get("recommendations", {})

            # Construct Final Result
            # Shape must match what the frontend (Analytics.tsx) reads.
            analysis_result = {
                "company_name": name,
                "company_description": desc,
                "last_updated": datetime.now().isoformat(),
                "confidence_score": f"{item['prediction']['details'].get('confidence', 'N/A')}% (AI)" if not item.get('skip_ai') else "100% (Manual)",
                "greenwashingLabel": item['gw_label'],
                "internal_documents_analysis": {
                    "major_findings": [
                        f"Risk Level: {item['final_label_str']}",
                        f"Reason: {item['reasoning_text']}"
                    ],
                    "compliance_risks": [item['reasoning_text']] if item['gw_label'] == 1 else []
                },
                "reviews_analysis": {
                    "employee_tone": "N/A",
                    "customer_tone": "N/A",
                    "common_issues": [],
                    "overall_sentiment_score": f"{int(item['features_dict']['overall_sentiment_score'] * 100)}/100"
                },
                "recommended_actions": recs,
                "external_summary": {
                    "key_highlights": [f"External Sentiment Gap: {item['features_dict']['external_sentiment_gap']}"],
                    "public_sentiment": "Mixed" if item['features_dict']['external_sentiment_gap'] > 0.1 else "Positive",
                    "recent_news_summary": item['reasoning_text'],
                    "possible_bias": "None",
                },
                "risk_assessment": {
                    "financial_risk": "High" if item['final_label_str'] == "Greenwashing" else "Low",
                    "reputation_risk": "Critical" if item['final_label_str'] == "Greenwashing" else ("Medium" if item['final_label_str'] == "At Risk" else "Low"),
                    "compliance_risk": "Medium",
                    "market_risk": "Low",
                    "overall_risk_level": item['final_label_str']
                },
                "final_company_score": {
                    # Sentiment may arrive as 0-1 or already 0-100; scale only
                    # the 0-1 case.
                    "rating_out_of_100": int(item['features_dict']['overall_sentiment_score'] * 100) if item['features_dict']['overall_sentiment_score'] <= 1 else int(item['features_dict']['overall_sentiment_score']),
                    "label": item['prediction']['model_label']
                },
                "detailed_scores": item['features_dict'],
                "generated_summary": f"Classified as {item['prediction']['model_label']}"
            }

            results.append({"name": name, "label": item['gw_label'], "status": f"Processed ({item['final_label_str']})"})

            # DB Save
            existing = db.query(Company).filter(Company.name == name).first()
            if existing:
                existing.analysis_result = analysis_result
                existing.last_analysis_date = datetime.now()
            else:
                new_company = Company(
                    name=name,
                    description=desc,
                    analysis_result=analysis_result,
                    last_analysis_date=datetime.now()
                )
                db.add(new_company)
        db.commit()

    for row in csv_reader:
        # Flexible column names (normalized)
        name = row.get('company_name') or row.get('company') or row.get('name')
        text = row.get('description') or row.get('text') or row.get('claims') or ""

        if not name:
            continue

        # --- FEATURE CALCULATION (If columns missing) ---
        # 1. Base Sentiment
        sentiment_res = analyze_sentiment([text] if text else [])
        overall_sentiment = sentiment_res['score']

        # 2. Keyword Stats
        # Both header spellings are checked because historical CSVs contain
        # the typo 'frequecy' — do not "fix" these lookup strings.
        green_freq = float(row.get('green keyword frequecy') or row.get('green keyword frequency') or count_keywords(text, GREEN_KEYWORDS))

        # 3. Vague/Concrete Scores (Using simple heuristic or scoring func)
        # Assuming scoring.py has these, if not, fallback to simple version:
        try:
            # Basic sentence splitting
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            vague_ratio = float(row.get('vague keyword ratio') or calculate_vague_score(sentences))
            concrete_ratio = float(row.get('concrete cailm ratio') or row.get('concrete claim ratio') or calculate_concrete_score(sentences))
        except:
            # NOTE(review): bare except hides real scoring errors; fixed
            # fallback ratios keep the import running on malformed rows.
            vague_ratio = 0.2
            concrete_ratio = 0.3

        # 4. Aspect Sentiments (Fallback to overall if specific not found)
        # 'emission sentiment ' (trailing space) matches a known dirty header.
        emission_sent = float(row.get('emission sentiment ') or row.get('emission sentiment') or overall_sentiment)
        energy_sent = float(row.get('energy sentiment') or overall_sentiment)
        waste_sent = float(row.get('waste sentiment') or overall_sentiment)

        # EXTRACT FEATURES FOR MODEL (AND FRONTEND DISPLAY)
        # Naming Verification:
        # Frontend (Analytics.tsx) expects:
        # - green_keyword_frequency
        # - vague_keyword_ratio
        # - concrete_claim_ratio
        # - external_sentiment_gap
        # - emission_sentiment
        # - energy_sentiment
        # - waste_sentiment
        # - relative_focus_score

        features_dict = {
            'green_keyword_frequency': green_freq,
            'vague_keyword_ratio': vague_ratio,
            'concrete_claim_ratio': concrete_ratio,
            'overall_sentiment_score': overall_sentiment,
            'external_sentiment_gap': float(row.get('external_sentiment_gap') or 0.4),
            'emission_sentiment': emission_sent,
            'energy_sentiment': energy_sent,
            'waste_sentiment': waste_sent,
            'relative_focus_score': float(row.get('relative focus score') or 0.5)
        }

        gw_label_raw = row.get('greenwashing_label') or row.get('greenwashing label') or row.get('category')
        skip_ai = False

        if gw_label_raw:
            # Manual label from CSV - TRUST IT (No AI)
            skip_ai = True
            final_label_str = str(gw_label_raw).strip()
            if final_label_str.lower() in ['greenwashing', 'high', 'critical', '1']:
                final_label_str = "Greenwashing"; gw_label = 1
            elif final_label_str.lower() in ['medium', 'at risk']:
                final_label_str = "At Risk"; gw_label = 1
            else:
                final_label_str = "No Risk"; gw_label = 0

            reasoning_text = f"Classified as {final_label_str} based on historical CSV data."

            # Initialize dummy prediction for compatibility
            prediction = {
                'risk_label': final_label_str,
                'greenwashing_risk': gw_label,
                'details': {'confidence': 100},
                'model_label': final_label_str
            }
        else:
            # AI/Model Prediction (Fallback only if no label)
            prediction = predict_greenwashing_risk(text, company_name=name, features_dict=features_dict)

            final_label_str = prediction['risk_label']
            # Map old AI outputs to new strings just in case
            if final_label_str == "High" or final_label_str == "Critical": final_label_str = "Greenwashing"
            elif final_label_str == "Medium": final_label_str = "At Risk"
            elif final_label_str == "Low": final_label_str = "No Risk"

            gw_label = 1 if final_label_str in ["Greenwashing", "At Risk"] else 0
            reasoning_text = f"AI Analysis: Classified as {final_label_str} based on pattern matching."

            # --- HEURISTIC OVERRIDE (Forcing Sensitivity) ---
            # If Vague > 0.50 AND not enough concrete data to justify it (>10%)
            if vague_ratio > 0.50 and concrete_ratio < 0.10:
                final_label_str = "Greenwashing"
                gw_label = 1
                reasoning_text = "Risk High: Excessive vague language without supporting concrete data."
            elif concrete_ratio < 0.01 and overall_sentiment > 0.6:
                final_label_str = "Greenwashing"
                gw_label = 1
                reasoning_text = "Greenwashing Alert: Positive claims lack concrete evidence."

        # PERPLEXITY CHECK (Instant Processing for Paid API)
        # NOTE(review): this branch computes desc/recs/pplx_success but then
        # falls through to `pass` — it is effectively dead code (the item is
        # still queued for the batch below). Left as-is pending a decision
        # on the refactor described in the comments.
        pplx_success = False
        if PERPLEXITY_API_KEY and not skip_ai:
            pplx_data = research_company(name)
            if pplx_data:
                pplx_success = True
                # If Perplexity worked, save immediately and skip batch
                # Construct partial item to reuse logic or save directly?
                # Saving directly is safer to avoid mixups.
                desc = pplx_data.get("description", "AI unavailable")
                recs = pplx_data.get("recommendations", {})
                if "Controversy" in str(pplx_data.get("findings")): gw_label = 1  # Update risk

                # Plan: Construct `item` manually, adding 'pplx_insights' key.
                # Update `process_batch` to check for it.
                pass

        # Prepare Context
        context = f"""
        Greenwashing Risk: {final_label_str}
        Reason: {reasoning_text}
        Sentiment: {features_dict['overall_sentiment_score']:.2f}
        """

        item_data = {
            "name": name,
            "text": text,
            "context": context,
            "prediction": prediction,
            "features_dict": features_dict,
            "gw_label": gw_label,
            "final_label_str": final_label_str,
            "reasoning_text": reasoning_text,
            "skip_ai": skip_ai
        }

        # Queue for Batch
        gemini_batch.append(item_data)

        if len(gemini_batch) >= batch_size:
            process_batch_and_save(gemini_batch)
            gemini_batch = []

    # Final batch
    if gemini_batch:
        process_batch_and_save(gemini_batch)

    return {
        "message": f"Processed {len(results)} companies using Batch AI Analysis.",
        "predictions": results
    }
app/db/__pycache__/models.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
app/db/__pycache__/models.cpython-311.pyc ADDED
Binary file (2.47 kB). View file
 
app/db/__pycache__/session.cpython-310.pyc ADDED
Binary file (639 Bytes). View file
 
app/db/__pycache__/session.cpython-311.pyc ADDED
Binary file (1.03 kB). View file
 
app/db/models.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, Text, JSON, DateTime, ForeignKey, Float
2
+ from sqlalchemy.orm import relationship
3
+ from datetime import datetime
4
+ from .session import Base
5
+
6
class Company(Base):
    """A company together with its most recent greenwashing analysis."""

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)
    # Unique: endpoints look companies up by name and upsert on it.
    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)
    website = Column(String, nullable=True)
    # NOTE(review): datetime.utcnow is naive and deprecated in Python 3.12;
    # consider a timezone-aware default — kept as-is to avoid changing stored values.
    last_analysis_date = Column(DateTime, default=datetime.utcnow)

    # JSON blobs for structured analysis data
    analysis_result = Column(JSON, nullable=True)

    requests = relationship("AnalysisRequest", back_populates="company")
19
+
20
class AnalysisRequest(Base):
    """One user-submitted analysis job, tracked through its lifecycle."""

    __tablename__ = "requests"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(String, index=True)  # Linking to frontend user ID
    company_name = Column(String)
    website = Column(String, nullable=True)
    document_name = Column(String, nullable=True)
    # Extracted text from PDF — NOTE(review): create_request currently stores
    # the uploaded file's *path* here; text extraction happens at approval.
    document_content = Column(Text, nullable=True)

    status = Column(String, default="pending")  # pending, processing, completed, failed
    submission_date = Column(DateTime, default=datetime.utcnow)

    analysis_result = Column(JSON, nullable=True)
    # Populated with the error message when an approval attempt fails.
    rejection_reason = Column(String, nullable=True)

    company_id = Column(Integer, ForeignKey("companies.id"), nullable=True)
    company = relationship("Company", back_populates="requests")
app/db/session.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Local SQLite file in the working directory; swap in the commented URL
# (with real credentials) to run against Postgres.
SQLALCHEMY_DATABASE_URL = "sqlite:///./greenintellect.db"
# SQLALCHEMY_DATABASE_URL = "postgresql://user:password@postgresserver/db"

# check_same_thread=False is needed because FastAPI may hand the SQLite
# connection to a different thread than the one that created it.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()

def get_db():
    """FastAPI dependency: yield a DB session, always closing it afterwards."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
app/main.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv
from .api import endpoints
from .db.session import engine, Base

# Load environment variables (API keys etc.) before anything else reads them.
load_dotenv()

# Create tables on startup (no-op if they already exist).
Base.metadata.create_all(bind=engine)

app = FastAPI(title="Green Intellect API", version="1.0.0")

# CORS: wide open for development; restrict allow_origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(endpoints.router, prefix="/api")

@app.get("/")
def read_root():
    """Health-check / landing endpoint."""
    return {"message": "Welcome to Green Intellect API"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
app/services/__pycache__/analysis_engine.cpython-310.pyc ADDED
Binary file (7.91 kB). View file
 
app/services/__pycache__/analysis_engine.cpython-311.pyc ADDED
Binary file (19.6 kB). View file
 
app/services/__pycache__/hugchat_client.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
app/services/__pycache__/llm_generator.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
app/services/__pycache__/ml_logic.cpython-311.pyc ADDED
Binary file (6.12 kB). View file
 
app/services/__pycache__/ml_models.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
app/services/__pycache__/ml_models.cpython-311.pyc ADDED
Binary file (1.8 kB). View file
 
app/services/__pycache__/pdf_processor.cpython-310.pyc ADDED
Binary file (887 Bytes). View file
 
app/services/__pycache__/pdf_processor.cpython-311.pyc ADDED
Binary file (1.62 kB). View file
 
app/services/__pycache__/perplexity_client.cpython-311.pyc ADDED
Binary file (2.87 kB). View file
 
app/services/__pycache__/scoring.cpython-310.pyc ADDED
Binary file (3.67 kB). View file
 
app/services/__pycache__/scoring.cpython-311.pyc ADDED
Binary file (7.65 kB). View file
 
app/services/__pycache__/scraper.cpython-310.pyc ADDED
Binary file (4.39 kB). View file
 
app/services/__pycache__/scraper.cpython-311.pyc ADDED
Binary file (18.4 kB). View file
 
app/services/analysis_engine.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
3
+ from .scraper import get_company_news, get_company_reviews, report_progress
4
+ from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
5
+ from .llm_generator import generate_company_description, generate_ai_recommendations
6
+
7
+ # Aspect Keywords
8
+ EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
9
+ ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
10
+ WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
11
+
12
def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news).

    Args:
        pdf_text: full extracted text of the company's sustainability report.
        news_articles: list of dicts with 'title', 'content' and 'url' keys.

    Returns:
        A list of findings, each a dict with a CONSISTENT schema:
        {claim_type, evidence_title, evidence_url, severity}.
        (Previously the first detector emitted evidence/source/risk_level while
        the others emitted evidence_url/evidence_title/severity — unified now.)
    """
    import re

    contradictions = []

    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']

    # Keywords that indicate environmental context (Strict Physical Terms only)
    # Generic words like 'green'/'sustainability' are excluded: they also
    # appear in financial contexts and caused false positives.
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']

    # Exclude financial regulators/crimes to avoid flagging financial fines as
    # greenwashing (RBI, SEBI, SEC, etc.).
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
    # BUGFIX: match exclusions on word boundaries — a plain substring test let
    # 'sec' match 'section'/'security' and silently drop valid articles.
    exclusion_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in financial_exclusions) + r')\b'
    )

    # Negative signals that question a claim.
    negative_keywords = ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']

    for article in news_articles:
        text = (article['title'] + " " + article['content']).lower()

        # Safety check: ignore articles about financial regulators/crimes.
        if exclusion_re.search(text):
            continue

        # Only count articles that are actually about the environment.
        if not any(k in text for k in env_context):
            continue

        for key in claim_keywords:
            if key in text and any(neg in text for neg in negative_keywords):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence_title": article['title'],
                    "evidence_url": article['url'],
                    "severity": "High"
                })
                break  # one finding per article for this detector

    # Keywords that indicate skepticism or allegations
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']

    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)

    # Strong PDF claims vs. skeptical press coverage.
    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })

    # General compliance-risk detection (not just contradictions).
    # BUGFIX: 'EPA' was uppercase but compared against lowercased text, so it
    # could never match — now lowercased.
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
            contradictions.append({
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })

    return contradictions
82
+
83
def detect_hidden_patterns(all_reviews):
    """
    Analyze reviews for hidden signals:
    - Repeated phrases (possible astroturfing / coordinated posting)
    - Availability of multiple platforms for cross-validation

    Only runs when more than 10 reviews are available.
    """
    findings = []

    if len(all_reviews) <= 10:
        return findings

    # Astroturfing heuristic: a low share of unique review bodies.
    snippets = [review['content'][:500] for review in all_reviews]
    uniqueness = len(set(snippets)) / len(snippets)
    if uniqueness < 0.7:
        findings.append({
            "pattern": "Potential astroturfing detected",
            "description": f"Only {int(uniqueness*100)}% unique review content - may indicate coordinated posting",
            "severity": "Medium",
        })

    # Cross-platform coverage: employee (Glassdoor) vs. public (Reddit) voices.
    glassdoor_hits = [review for review in all_reviews if 'glassdoor' in review['url'].lower()]
    reddit_hits = [review for review in all_reviews if 'reddit' in review['url'].lower()]
    if glassdoor_hits and reddit_hits:
        findings.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {len(glassdoor_hits)} Glassdoor and {len(reddit_hits)} Reddit discussions for cross-validation",
            "severity": "Info",
        })

    return findings
116
+
117
async def analyze_company(company_name: str, pdf_path: str):
    """
    Run the full greenwashing analysis pipeline for one company.

    Steps: PDF extraction -> optional Perplexity research -> news/review
    scraping -> contradiction & pattern detection -> sentiment scoring ->
    composite score, 3-state risk classification, AI-generated insights.

    Args:
        company_name: display name used for research, scraping and prompts.
        pdf_path: path to the uploaded sustainability report PDF.

    Returns:
        A JSON-serialisable dict consumed directly by the frontend.
    """
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)

    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)

    # --- PERPLEXITY AI INTEGRATION ---
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None

    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)

    # 2. Comprehensive Scraping (ALL available sources)
    # Always run scraping to get real news, even if Perplexity is active.
    # Perplexity findings (if any) stay in pplx_data for insights and are
    # intentionally not surfaced as 'news'.
    news_articles = await get_company_news(company_name)

    # Progress 50-80% handled by get_company_reviews.
    # BUGFIX: this call was previously duplicated, doubling scrape time.
    reviews = await get_company_reviews(company_name)

    # 3. Analyze PDF Content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)

    # 4. Detect Contradictions and Hidden Patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)

    # 5. Analyze External Sentiment with ALL data
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text

    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}

    # Aspect-based sentiment (REAL SCORES)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)

    # 6. Calculate Evidence-Based Score with detailed metrics
    report_progress("Calculating final scores...", 95)

    # Detailed keyword/claim metrics from the PDF.
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)

    def get_linear_score(s_dict):
        """Map a {label, score} sentiment dict to a 0-100 scale (50 = neutral).

        (Previously defined twice in this function with identical bodies.)
        """
        if s_dict['label'] == 'Positive':
            return 50 + (s_dict['score'] * 50)  # 50-100
        if s_dict['label'] == 'Negative':
            return 50 - (s_dict['score'] * 50)  # 0-50
        return 50  # Neutral

    # Internal sentiment from the company's own environmental sentences.
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])

    s_int = get_linear_score(internal_sentiment_data)
    s_ext = get_linear_score(news_sentiment)
    s_rev = get_linear_score(reviews_sentiment)

    # Composite sentiment driving the headline score:
    # 35% internal (what they say) + 45% news + 20% reviews (employee/public).
    composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)

    # Base score = qualitative sentiment, then adjusted by quantitative proof:
    # concrete data boosts, vague language penalises.
    final_score = composite_score_val

    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25)  # Up to +25 points for concrete data
    score_modifier -= min(vague_ratio * 50, 20)      # Up to -20 points for vague language
    final_score += score_modifier

    # Contradiction Penalty (Facts Check): heavy per-finding penalty.
    if contradictions:
        final_score -= (len(contradictions) * 15)

    # Clamp to 0-100.
    final_score = max(0, min(100, final_score))

    # Gap between news and review sentiment (possible bias indicator).
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])

    # Determine risk level (3-state system):
    #   2 = Greenwashing (High/Critical), 1 = At Risk (Medium), 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []

    # 1. Contradictions (Immediate Greenwashing)
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")

    # 2. Score Thresholds
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 3. Vague Language
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 4. Empty Claims: positive press with essentially no concrete data.
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")

    # --- SAFE HARBOR OVERRIDE ---
    # Companies in inherently high-impact industries need exceptional
    # mitigation evidence before risk is waived.
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)

    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
            if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                pass_safe_harbor = True
            else:
                if risk_level_code < 2:
                    risk_level_code = 2
                    risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
            pass_safe_harbor = True

    if pass_safe_harbor:
        risk_level_code = 0  # Force No Risk
        if risk_reasons:
            risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map risk code to display label; the binary flag is 1 only for full
    # "Greenwashing" (UI warning banners), not for the intermediate "At Risk".
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        greenwashing_flag = 0
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Surface risk reasons at the top of the major findings list.
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']

    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}

    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to Gemini (module-level imports) or static defaults.
        try:
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)

            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }

    # --- COMPOSITE SENTIMENT SCORE (for reporting) ---
    # Deliberately recomputed AFTER the [RISK] prepend above, so flagged
    # reasons influence the reported internal sentiment (original behavior).
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])

    int_s = get_linear_score(internal_sentiment)
    ext_s = get_linear_score(news_sentiment)
    rev_s = get_linear_score(reviews_sentiment)

    composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
    composite_score_norm = composite_score / 100.0

    result = {
        "company_name": company_name,
        "company_description": company_description,
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag,  # 1 if Greenwashing, else 0 (for binary UIs)

        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3),
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },

        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}",
                f"Risk Level: {overall_risk_str}"
            ],
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },

        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },

        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low",
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # IMPACT: 3-State Output
            "overall_risk_level": overall_risk_str
        },

        "opportunities_and_strengths": [
            "Expand concrete data reporting",
            "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
            "Strong concrete data transparency",
            "Positive external sentiment alignment"
        ],

        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'],
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5]
        },

        "recommended_actions": ai_recommendations,

        # BUGFIX: detect_hidden_patterns output was computed but discarded;
        # merge it with the heuristic vague-language flag.
        "hidden_patterns": hidden_patterns + ([
            {"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
        ] if vague_ratio > 0.4 else [])
    }

    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result
app/services/hugchat_client.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from hugchat import hugchat
from hugchat.login import Login
from dotenv import load_dotenv

load_dotenv()

# HuggingChat credentials (set in .env); both are required to log in.
HF_EMAIL = os.getenv("HUGGINGFACE_EMAIL")
HF_PASS = os.getenv("HUGGINGFACE_PASS")

# Module-level cache so the login session is reused across calls.
_chatbot = None
13
+
14
def get_chatbot():
    """Return a cached, logged-in HuggingChat bot, or None if login fails."""
    global _chatbot

    # Reuse an existing session when available.
    if _chatbot:
        return _chatbot

    if not HF_EMAIL or not HF_PASS:
        print("Warning: HUGGINGFACE_EMAIL or HUGGINGFACE_PASS not found.")
        return None

    try:
        session = Login(HF_EMAIL, HF_PASS)
        cookie_jar = session.login()
        _chatbot = hugchat.ChatBot(cookies=cookie_jar.get_dict())
    except Exception as exc:
        print(f"HuggingChat Login Error: {exc}")
        return None
    return _chatbot
31
+
32
def generate_hugchat_response(prompt: str) -> str:
    """
    Generate text using HuggingChat.

    Returns the model's reply, or a short "AI unavailable" sentinel string
    when authentication is missing or the request fails.
    """
    chatbot = get_chatbot()
    if not chatbot:
        return "AI unavailable (Auth missing)."

    try:
        # Create a new conversation for isolation.
        # (Renamed from `id`, which shadowed the builtin.)
        conv_id = chatbot.new_conversation()
        chatbot.change_conversation(conv_id)

        response = chatbot.chat(prompt)
        text = response.wait_until_done()

        # Cleanup? (Optional, but good for privacy)
        # chatbot.delete_conversation(conv_id)

        return text
    except Exception as e:
        print(f"HuggingChat Error: {e}")
        return "AI unavailable (Error)."
app/services/llm_generator.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import google.generativeai as genai
import os
from dotenv import load_dotenv

load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Gemini is currently disabled by request; restore this block to re-enable.
# if GEMINI_API_KEY:
#     genai.configure(api_key=GEMINI_API_KEY)
#     model = genai.GenerativeModel('gemini-2.0-flash')
# else:
model = None  # All generators below return canned fallbacks while this is None.
print("Gemini LLM Disabled by user request.")
# print("Warning: GEMINI_API_KEY not found in .env. LLM features will be disabled.")
16
+
17
def generate_company_description(company_name: str) -> str:
    """
    Generate a brief, neutral 2-3 sentence company description via Gemini.

    Returns a sentinel string when the model is disabled or errors out.
    """
    if not model:
        return "AI description unavailable (API Key missing)."

    try:
        prompt = (
            f"Provide a factual, neutral 2-3 sentence description of the company "
            f"'{company_name}', focusing on its industry and main products. "
            f"Do not mention sentiment or controversies."
        )
        reply = model.generate_content(prompt)
        return reply.text.strip()
    except Exception as exc:
        print(f"Error generating description for {company_name}: {exc}")
        return "AI description unavailable due to an error."
31
+
32
def generate_ai_recommendations(company_name: str, analysis_data: dict) -> dict:
    """
    Generate tailored recommendations for Customers, Investors and Leadership.

    Returns a dict with keys "for_customers", "for_investors" and
    "for_company_leadership"; falls back to static tips when the LLM is
    disabled or the response cannot be parsed.
    """
    import json

    if not model:
        return {
            "for_customers": ["Review provided evidence links."],
            "for_investors": ["Analyze financial risks mentioned in report."],
            "for_company_leadership": ["Address flagged contradictions."]
        }

    try:
        # Summarised context keeps the prompt small and focused.
        context = f"""
        Company: {company_name}
        Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
        Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
        Contradictions: {len(analysis_data.get('contradictions_detected', []))} found.
        Sentiment: {analysis_data.get('external_summary', {}).get('public_sentiment', 'N/A')}
        """

        prompt = f"""
        Based on the following analysis of '{company_name}', provide 3 specific, actionable recommendations for each group (Customers, Investors, Leadership).
        Focus on greenwashing, transparency, and sustainability accountability.

        Analysis Context:
        {context}

        Output purely as JSON format with keys: "for_customers", "for_investors", "for_company_leadership". Each key should have a list of strings.
        Do not allow Markdown code blocks. Just raw JSON.
        """

        raw = model.generate_content(prompt).text.strip()

        # Strip accidental markdown fencing before parsing.
        if raw.startswith("```json"):
            raw = raw[7:]
        if raw.endswith("```"):
            raw = raw[:-3]

        return json.loads(raw)

    except Exception as exc:
        print(f"Error generating recommendations for {company_name}: {exc}")
        # Static fallback keeps the API response shape intact.
        return {
            "for_customers": ["Review provided evidence links.", "Cross-check claims."],
            "for_investors": ["Monitor reputational risks.", "Demand clearer impact reports."],
            "for_company_leadership": ["Address detected contradictions.", "Improve transparency."]
        }
84
+
85
def generate_combined_insights(company_name: str, analysis_data: dict) -> dict:
    """
    Combine description and recommendations into a single API call to reduce
    rate-limit usage.

    Returns: { "description": str, "recommendations": dict }
    """
    # BUGFIX: `json` was never imported in this module/function scope, so
    # json.loads raised NameError and the except branch returned the fallback
    # on every call, even when the model responded successfully.
    import json

    if not model:
        return {
            "description": "AI description unavailable (API Key missing).",
            "recommendations": generate_ai_recommendations(company_name, analysis_data)  # static fallback path
        }

    try:
        context = f"""
        Company: {company_name}
        Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
        Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
        """

        prompt = f"""
        Analyze '{company_name}' based on this context:
        {context}

        Provide 2 outputs in a single JSON object:
        1. "description": A factual 2-sentence description of the company.
        2. "recommendations": A dictionary with keys "for_customers", "for_investors", "for_company_leadership", containing 3 actionable tips for each.

        Output purely JSON. No markdown.
        """

        response = model.generate_content(prompt)
        text = response.text.strip()
        # Strip accidental markdown fencing before parsing.
        if text.startswith("```json"): text = text[7:]
        if text.endswith("```"): text = text[:-3]

        return json.loads(text)
    except Exception as e:
        print(f"Error generating combined insights for {company_name}: {e}")
        return {
            "description": "AI description unavailable due to high traffic.",
            "recommendations": {
                "for_customers": ["Review evidence links."],
                "for_investors": ["Analyze risks."],
                "for_company_leadership": ["Address contradictions."]
            }
        }
130
+
131
def generate_batch_insights(companies_data: list) -> dict:
    """
    Generate insights for a batch of companies (10-15 recommended) in a SINGLE prompt.

    Input: list of {name, context: str}
    Output: dict { company_name: { "description": ..., "recommendations": ... } }

    Uses HuggingChat while Gemini is disabled; returns a per-company static
    fallback when HuggingChat output cannot be parsed, and {} only when the
    Gemini backend itself errors.
    """
    import json
    from .hugchat_client import generate_hugchat_response

    def _fallback():
        # One canned insight object per company, keyed by exact name.
        return {c['name']: {
            "description": "AI unavailable (Key missing)",
            "recommendations": {
                "for_customers": ["Review evidence."],
                "for_investors": ["Check risks."],
                "for_company_leadership": ["Monitor compliance."]
            }
        } for c in companies_data}

    # Shared context block listing every company.
    batch_context = ""
    for i, c in enumerate(companies_data):
        batch_context += f"\n--- Company {i+1}: {c['name']} ---\n{c['context']}\n"

    # HuggingChat path (active while Gemini is disabled).
    if not model:
        prompt = f"""
        You are a sustainability analyst. Analyze these {len(companies_data)} companies.
        {batch_context}

        Return a valid JSON OBJECT where keys are company names.
        For each company, provide:
        1. "description": A factual 2-sentence summary.
        2. "recommendations": Object with keys "for_customers", "for_investors", "for_company_leadership" (list of 3 tips each).

        Example JSON Structure:
        {{
            "Company Name": {{
                "description": "...",
                "recommendations": {{ "for_customers": [...], ... }}
            }}
        }}

        IMPORTANT: Output ONLY valid JSON. No Markdown. No Intro.
        """

        print("Using HuggingChat for Batch Analysis...")
        response_text = generate_hugchat_response(prompt)

        try:
            # Clean markdown fencing before parsing.
            text = response_text.strip()
            if text.startswith("```json"): text = text[7:]
            if text.endswith("```"): text = text[:-3]
            if "{" not in text: raise Exception("Invalid JSON format")

            return json.loads(text)
        except Exception as e:
            print(f"HuggingChat Parsing Error: {e}")
            # BUGFIX: the old fall-through check (`not 'response_text' in
            # locals()`) was unreachable, so parse failures continued into the
            # Gemini path with model=None and returned {}. Return the
            # per-company fallback instead.
            return _fallback()

    # Gemini path (backup if re-enabled).
    try:
        prompt = f"""
        Analyze the following {len(companies_data)} companies based on the provided contexts.
        {batch_context}

        For EACH company, provide:
        1. "description": A factual 2-sentence summary.
        2. "recommendations": 3 specific actionable tips per group (Customers, Investors, Leadership).

        Output purely as a JSON OBJECT where keys are the exact company names and values are the insight objects.
        Example:
        {{
            "Company A": {{ "description": "...", "recommendations": {{ ... }} }},
            "Company B": ...
        }}

        No markdown formatting. Just JSON.
        """

        response = model.generate_content(prompt)
        text = response.text.strip()
        if text.startswith("```json"): text = text[7:]
        if text.endswith("```"): text = text[:-3]

        return json.loads(text)

    except Exception as e:
        print(f"Batch generation error: {e}")
        return {}
app/services/ml_logic.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .ml_models import ml_models
from .scoring import calculate_vague_score, calculate_concrete_score, analyze_sentiment
import re
import joblib
import os
import pandas as pd
import numpy as np

# Path configurations
# Resolved two directory levels above this file (…/ml_models); holds the
# joblib-pickled artifacts shipped with the deployment.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODEL_DIR = os.path.join(BASE_DIR, "ml_models")

# Pre-trained artifact locations (joblib pickles).
ENSEMBLE_PATH = os.path.join(MODEL_DIR, 'ensemble_model.pkl')
FEATURE_COLS_PATH = os.path.join(MODEL_DIR, 'all_feature_columns.pkl')
CAT_MAPPING_PATH = os.path.join(MODEL_DIR, 'category_to_greenwashing_mapping.pkl')
BINARY_MAPPING_PATH = os.path.join(MODEL_DIR, 'binary_to_report_name_mapping.pkl')

# Lazily populated caches (see load_artifacts); None until the first successful load.
_ensemble_model = None
_feature_cols = None
_binary_mapping = None
21
+
22
def load_artifacts():
    """Lazily load and cache the ensemble model, feature columns and label mapping.

    Returns:
        tuple: (model, feature_columns, binary_mapping). All three are None
        when the artifacts cannot be loaded (missing files, unpickling errors),
        in which case callers fall back to the heuristic scorer.
    """
    global _ensemble_model, _feature_cols, _binary_mapping

    # Identity checks rather than truthiness: an empty-but-loaded feature
    # column list must still count as "cached", otherwise every call would
    # hit the filesystem and re-unpickle the model.
    if _ensemble_model is not None and _feature_cols is not None:
        return _ensemble_model, _feature_cols, _binary_mapping

    try:
        if os.path.exists(ENSEMBLE_PATH):
            print(f"[ML] Loading Ensemble Model from {ENSEMBLE_PATH}...")
            _ensemble_model = joblib.load(ENSEMBLE_PATH)
            _feature_cols = joblib.load(FEATURE_COLS_PATH)

            if os.path.exists(BINARY_MAPPING_PATH):
                _binary_mapping = joblib.load(BINARY_MAPPING_PATH)
            else:
                # Fallback mapping if the pickled mapping file is missing.
                _binary_mapping = {0: 'Not Greenwashing (Low)', 1: 'Greenwashing (High/Medium)'}

            print(f"[ML] Ensemble Model Loaded. Features: {_feature_cols}")
            return _ensemble_model, _feature_cols, _binary_mapping
        else:
            # Make the silent fallback visible in logs instead of failing mutely.
            print(f"[ML] Ensemble model not found at {ENSEMBLE_PATH}; heuristic fallback will be used.")
    except Exception as e:
        print(f"[ML] Failed to load artifacts: {e}")

    return None, None, None
46
+
47
def train_model(data: list[dict]):
    """Compatibility stub: training is disabled in favour of the shipped ensemble.

    Kept so older callers importing ``train_model`` keep working; performs no
    training and always reports a 0.0 accuracy score.
    """
    print("[ML] Train requested, but system is now using pre-trained Ensemble Model.")
    return 0.0
54
+
55
def predict_greenwashing_risk(text, company_name="Unknown", features_dict=None):
    """
    Predict greenwashing risk using Ensemble Model if features are provided.
    Fallback to heuristic if only text is available.

    Args:
        text: Raw report/article text (used by the heuristic fallback path).
        company_name: Display name echoed back in the result payload.
        features_dict: Optional numeric features keyed by training column
            names; required for the ensemble path.

    Returns:
        dict with 'company_name', 'greenwashing_score' (0..1), 'risk_label',
        'model_label' and a 'details' sub-dict identifying which model ran.
    """
    model, features, binary_map = load_artifacts()

    # 1. Prediction using Ensemble Model (Feature-based)
    if model and features and features_dict:
        try:
            # Prepare input dataframe with correct column order
            input_data = {}
            for col in features:
                # Handle typo in specific user column "frequecy"
                val = features_dict.get(col)
                if val is None:
                    # Fallback for known variations of the training column names.
                    if col == 'Green Keyword frequecy':
                        val = features_dict.get('Green Keyword Frequency', 0)
                    elif col == 'Emission Sentiment ': # Note space
                        val = features_dict.get('Emission Sentiment', 0)
                    else:
                        val = 0
                input_data[col] = [float(val)]

            df = pd.DataFrame(input_data)

            # Predict
            pred_binary = model.predict(df)[0]
            pred_proba = model.predict_proba(df)[0] # [prob_0, prob_1]
            prob_gw = pred_proba[1]

            # granular mapping based on probability
            if prob_gw >= 0.75:
                risk_label = "High"
                label_text = "High Risk"  # NOTE(review): label_text is currently unused
            elif prob_gw >= 0.35:
                risk_label = "Medium"
                label_text = "Medium Risk"
            else:
                risk_label = "Low"
                label_text = "Low Risk"

            return {
                "company_name": company_name,
                "greenwashing_score": round(prob_gw, 3),
                "risk_label": risk_label,
                "model_label": risk_label, # Use simple label for UI mapping
                "details": {
                    "model_used": "Ensemble Voting Classifier",
                    "confidence": round(max(pred_proba) * 100, 1),
                    "features": features_dict # Return original features for UI
                }
            }

        except Exception as e:
            print(f"[ML] Ensemble prediction failed: {e}")
            # Fallback to heuristic below

    # 2. Heuristic Fallback (Text-based)
    # Vague marketing language raises risk; concrete metrics lower it;
    # negative sentiment adds a small penalty.
    sentences = re.split(r'(?<=[.!?]) +', text)
    vague_score = calculate_vague_score(sentences)
    concrete_score = calculate_concrete_score(sentences)
    sentiment = analyze_sentiment([text])

    risk_score = 0.5 + (vague_score * 0.4) - (concrete_score * 0.5)
    if sentiment['label'] == 'Negative':
        risk_score += sentiment['score'] * 0.2
    risk_score = max(0, min(1, risk_score))  # clamp to [0, 1]

    return {
        "company_name": company_name,
        "greenwashing_score": round(risk_score, 3),
        "risk_label": "High Risk" if risk_score > 0.7 else "Low Risk",
        "model_label": "Heuristic Analysis",
        "details": {
            "vague_language_ratio": round(vague_score, 3),
            "concrete_claims_ratio": round(concrete_score, 3),
            "model_used": "Heuristic Fallback"
        }
    }
136
+
137
+
app/services/ml_models.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch


class MLModels:
    """Process-wide singleton holding the expensive transformer models."""

    _instance = None

    def __new__(cls):
        # Fast path: models were already loaded once for this process.
        if cls._instance is not None:
            return cls._instance

        # Register the instance first (matches original behaviour), then load.
        cls._instance = instance = super(MLModels, cls).__new__(cls)
        instance.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Loading models on {instance.device}...")

        # Sentence embeddings used for semantic matching.
        instance.st_model = SentenceTransformer(
            'sentence-transformers/all-MiniLM-L6-v2', device=instance.device)

        # FinBERT for financial-tone sentiment classification.
        gpu_index = 0 if instance.device == 'cuda' else -1
        instance.finbert = pipeline(
            "text-classification", model="yiyanghkust/finbert-tone", device=gpu_index)

        # ClimateBERT for ESG sentiment intentionally left disabled (heavy).
        # instance.climatebert = pipeline("text-classification",
        #     model="climatebert/distilroberta-base-climate-sentiment", device=gpu_index)

        print("Models loaded successfully.")
        return cls._instance


# Shared, eagerly constructed singleton used across all services.
ml_models = MLModels()
app/services/pdf_processor.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import re
3
+
4
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the plain text of every page of *pdf_path*.

    Returns an empty string when the file cannot be opened or parsed.
    """
    pages = []
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
    return "".join(pages)
14
+
15
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on terminal punctuation followed by whitespace."""
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    return sentence_boundary.split(text)
18
+
19
def clean_text(text: str) -> str:
    """Collapse whitespace runs into single spaces and strip the ends."""
    collapsed = re.sub(r"\s+", " ", str(text))
    return collapsed.strip()
app/services/perplexity_client.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
import json
from dotenv import load_dotenv

# Pull environment variables from a local .env file (no-op when absent).
load_dotenv()

# Perplexity API credential; research_company() degrades gracefully when unset.
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
9
+
10
def research_company(company_name: str) -> dict:
    """
    Uses Perplexity AI to conduct deep web research on a company's environmental impact.

    Args:
        company_name: Name of the company to research.

    Returns:
        dict with keys 'description', 'findings', 'sentiment', 'recommendations'
        and 'citations' on success; None when the API key is missing or the
        request/parsing fails (callers treat None as "research unavailable").
    """
    if not PERPLEXITY_API_KEY:
        print("Warning: PERPLEXITY_API_KEY not found.")
        return None

    url = "https://api.perplexity.ai/chat/completions"

    # Prompt designed to extract structured data compatible with our existing analysis
    system_prompt = "You are an environmental analyst. Research the target company and return a JSON object with: 'description' (factual summaries), 'findings' (list of 5 key controversies or achievements), 'sentiment' (Positive/Negative/Mixed), 'citations' (list of source URLs), and 'recommendations' (object with keys 'for_customers', 'for_investors', 'for_company_leadership', each a list of 3 strings)."

    user_prompt = f"Research the environmental track record of '{company_name}'. Focus on emissions, greenwashing, and sustainability 2023-2025."

    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        # Low temperature: we want reproducible, fact-oriented JSON output.
        "temperature": 0.2
    }

    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }

    try:
        # Timeout added: without one, a stalled upstream call would block the
        # worker indefinitely.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()

        result = response.json()
        content = result['choices'][0]['message']['content']
        citations = result.get('citations', [])

        # Strip Markdown code fences the model sometimes wraps around the JSON.
        # strip() first so a trailing newline does not defeat endswith("```").
        content = content.strip()
        if content.startswith("```json"): content = content[7:]
        if content.endswith("```"): content = content[:-3]

        data = json.loads(content)
        data['citations'] = citations  # Ensure citations are attached
        return data

    except Exception as e:
        print(f"Perplexity API Error: {e}")
        return None
app/services/scoring.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from sentence_transformers import util
from .ml_models import ml_models

# Reference phrases
# Each bank below anchors the semantic matcher (see semantic_matches): a
# sentence counts as a hit when its embedding is close to ANY phrase in a bank.

# General environmental topics.
ENV_REF = [
    "environment", "climate change", "carbon emissions", "pollution", "waste",
    "green energy", "renewable resources", "sustainability", "biodiversity",
    "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
]
# Broader ESG (environment + social + governance) topics.
ESG_REF = [
    "environment", "social responsibility", "governance", "sustainability", "carbon emissions",
    "green energy", "renewable resources", "waste management", "climate change", "pollution control",
    "biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
    "community development", "employee welfare", "diversity", "ethics"
]
# Concrete, already-taken actions.
ACTION_REF = [
    "implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
    "sustainability project", "steps taken to reduce carbon emissions",
    "initiatives to help the environment", "measures to prevent greenwashing"
]
# Forward-looking pledges and claims (the raw material of greenwashing).
CLAIM_REF = [
    "plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
    "intent to reduce", "objective to be", "aims for sustainability",
    "pledged to achieve", "will reduce carbon", "expect to reach net zero",
    "plans to be carbon neutral by", "commitment to net zero by",
    "goal to be eco friendly by", "target year for sustainability",
    "striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
]
30
+
31
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """Return the sentences whose best cosine similarity to any reference
    phrase meets *threshold*.

    Args:
        sentences: Iterable of sentence strings to test.
        reference: Reference phrase bank (list of strings).
        threshold: Minimum cosine similarity for a match.
        batch_size: Sentences are embedded in batches of this size to bound
            memory use on long documents.

    Returns:
        list[str]: matching sentences, stripped of surrounding whitespace.
    """
    model = ml_models.st_model
    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []

    # range() with a positive step never yields an empty slice here, so no
    # per-batch emptiness check is needed.
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        sim_matrix = util.cos_sim(sent_emb, ref_emb)  # shape: (batch, refs)

        for j, sim_scores in enumerate(sim_matrix):
            # A sentence matches if it is close to ANY reference phrase.
            if sim_scores.max().item() >= threshold:
                matches.append(batch[j].strip())

    # `matches` is already a list, so no empty-list special case is required.
    return matches
48
+
49
def calculate_scores(sentences):
    """Semantic-match *sentences* against the reference banks and return counts
    (plus the matched env/action sentences for display)."""
    matched = {
        "env": semantic_matches(sentences, ENV_REF),
        "esg": semantic_matches(sentences, ESG_REF),
        "action": semantic_matches(sentences, ACTION_REF),
        # Claims use a slightly looser threshold to catch softer pledge wording.
        "claim": semantic_matches(sentences, CLAIM_REF, threshold=0.54),
    }

    return {
        "env_count": len(matched["env"]),
        "esg_count": len(matched["esg"]),
        "action_count": len(matched["action"]),
        "claim_count": len(matched["claim"]),
        "env_sentences": matched["env"],
        "action_sentences": matched["action"],
    }
63
+
64
def calculate_vague_score(sentences):
    """
    Calculate the ratio of sentences containing vague/future-tense language.
    Returns 0.0 for an empty input list.
    """
    vague_regex = re.compile(
        r"aim(s|ing)? to|plan(s|ning)? to|committed to|strive(s|ing)? for|"
        r"intend(s|ing)? to|goal of|vision|hopefully|aspire(s|ing)? to|"
        r"future|potential|believe",
        re.IGNORECASE,
    )

    hits = sum(1 for sent in sentences if vague_regex.search(sent))
    # max(..., 1) guards the empty-input division.
    return hits / max(len(sentences), 1)
81
+
82
def calculate_concrete_score(sentences):
    """
    Calculate the ratio of sentences backed by specific, concrete metrics:
    percentages, money, tonnage, target years, or completed-action verbs.
    """
    concrete_patterns = (
        r"\d+(\.\d+)?%",                          # Percentages
        r"\$\d+",                                 # Money
        r"\d+ (tons|kg|metric tons|tonnes)",      # Weight
        r"by 20\d{2}",                            # Years (e.g. by 2030)
        r"reduced by", r"achieved", r"completed"  # Past tense concrete verbs
    )
    concrete_regex = re.compile("|".join(concrete_patterns), re.IGNORECASE)

    hits = sum(1 for sent in sentences if concrete_regex.search(sent))
    # max(..., 1) guards the empty-input division.
    return hits / max(len(sentences), 1)
102
+
103
def analyze_sentiment(text_chunks):
    """Aggregate FinBERT sentiment over *text_chunks* by majority label.

    Returns {"label": ..., "score": fraction of chunks carrying that label};
    Neutral/0.5 when nothing could be scored.
    """
    predictions = []
    for chunk in text_chunks:
        # Truncate to 1500 chars (roughly 300-400 tokens) to stay well within
        # the model's input limit.
        snippet = chunk[:1500] if len(chunk) > 1500 else chunk
        try:
            result = ml_models.finbert(snippet, truncation=True, max_length=512)
            predictions.append(result[0])  # e.g. {'label': 'Positive', 'score': 0.9}
        except Exception as e:
            print(f"Sentiment error: {e}")

    if not predictions:
        return {"label": "Neutral", "score": 0.5}

    total = len(predictions)
    tally = {
        name: sum(1 for r in predictions if r['label'] == name)
        for name in ("Positive", "Negative", "Neutral")
    }

    # Majority vote; ties fall through to Neutral.
    if tally["Positive"] > tally["Negative"] and tally["Positive"] > tally["Neutral"]:
        return {"label": "Positive", "score": tally["Positive"] / total}
    if tally["Negative"] > tally["Positive"] and tally["Negative"] > tally["Neutral"]:
        return {"label": "Negative", "score": tally["Negative"] / total}
    return {"label": "Neutral", "score": tally["Neutral"] / total}
126
+
127
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
    """
    Run sentiment analysis restricted to chunks mentioning any aspect keyword.
    Returns Neutral/0.5 when no chunk mentions the aspect at all.
    """
    relevant = [
        chunk for chunk in text_chunks
        if any(keyword in chunk.lower() for keyword in aspect_keywords)
    ]

    if not relevant:
        return {"label": "Neutral", "score": 0.5}

    return analyze_sentiment(relevant)
app/services/scraper.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
import requests
import logging
from fake_useragent import UserAgent
# `ddgs` is the renamed successor of `duckduckgo_search`; support both.
try:
    from ddgs import DDGS
except ImportError:
    from duckduckgo_search import DDGS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Rotating User-Agent generator used to disguise scraper requests.
ua = UserAgent()

# Progress tracking
# Module-level hook set via set_progress_callback(); None means "no listener".
progress_callback = None
25
+
26
def set_progress_callback(callback):
    """Register *callback(message, percentage)* to receive progress updates.

    Pass None to detach the current listener.
    """
    global progress_callback
    progress_callback = callback
30
+
31
def report_progress(message, percentage):
    """Forward a progress update to the registered callback (if any)."""
    if progress_callback:
        progress_callback(message, percentage)
    # Always echo locally so progress also shows up in container logs.
    print(f"[{percentage}%] {message}")
36
+
37
def setup_selenium_driver():
    """Setup a stealth Selenium driver with HuggingFace/Docker compatibility.

    Probes the environment and tries, in order: a system chromedriver with
    system Chromium (Docker/HF), a webdriver_manager-installed driver, and
    finally Selenium's default Chrome resolution. Raises only when every
    strategy fails inside Docker. Caller is responsible for driver.quit().
    """
    options = Options()
    options.add_argument("--headless=new") # New headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-infobars")
    options.add_argument("--window-size=1920,1080")
    # Random UA per driver instance to reduce fingerprinting.
    options.add_argument(f"user-agent={ua.random}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Check if running in Docker/HuggingFace environment
    is_docker = os.path.exists("/.dockerenv") or os.environ.get("HF_SPACE_ID")

    driver = None

    if is_docker:
        logger.info("Running in Docker/HuggingFace environment, using system Chromium")
        # Use system Chromium in Docker
        chromium_paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
        chromedriver_paths = ["/usr/bin/chromedriver", "/usr/local/bin/chromedriver"]

        # Point the browser binary at the first Chromium install we find.
        for chromium_path in chromium_paths:
            if os.path.exists(chromium_path):
                options.binary_location = chromium_path
                logger.info(f"Using Chromium at: {chromium_path}")
                break

        try:
            # Try with system chromedriver first
            for chromedriver_path in chromedriver_paths:
                if os.path.exists(chromedriver_path):
                    service = Service(chromedriver_path)
                    driver = webdriver.Chrome(service=service, options=options)
                    logger.info(f"Using chromedriver at: {chromedriver_path}")
                    break

            if driver is None:
                # Fallback to webdriver_manager (downloads a matching driver).
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Docker Chrome setup failed: {e}")
            # Final fallback - try default Chrome
            try:
                driver = webdriver.Chrome(options=options)
            except Exception as e2:
                logger.error(f"All Chrome drivers failed: {e2}")
                raise
    else:
        # Local development - use webdriver_manager
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver with manager: {e}")
            driver = webdriver.Chrome(options=options)

    # Apply stealth settings to mask common headless-automation signals.
    stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    return driver
109
+
110
async def scrape_url_selenium(url):
    """Fetch *url* with a stealth Selenium session.

    Returns a (clean_text, raw_html) pair, or ("", "") on any failure so
    callers can skip the page gracefully.
    """
    logger.info(f"Scraping with Selenium: {url}")
    try:
        def _selenium_task():
            driver = setup_selenium_driver()
            try:
                driver.get(url)
                # Crude settle time for JS-rendered pages; WebDriverWait would
                # be more precise but this is good enough for article content.
                import time
                time.sleep(3)
                return driver.page_source
            finally:
                driver.quit()

        raw_html = await asyncio.to_thread(_selenium_task)

        # Strip scripts/styles and flatten the DOM to visible text.
        soup = BeautifulSoup(raw_html, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.decompose()
        visible_text = soup.get_text(separator=' ', strip=True)
        return visible_text, raw_html

    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return "", ""
139
+
140
async def search_web(query, max_results=5):
    """
    Search the web using DuckDuckGo (no API key required).

    Returns a list of {"title", "url", "content", "query_type"} dicts;
    an empty list on any search failure.
    """
    try:
        # DDGS().text() is a synchronous generator, so run it off the event loop.
        def run_search():
            with DDGS() as ddgs:
                return list(ddgs.text(query, max_results=max_results))

        raw_hits = await asyncio.to_thread(run_search)

        # Normalise DDG's title/href/body schema to our internal result shape.
        return [
            {
                "title": hit.get('title', ''),
                "url": hit.get('href', ''),
                "content": hit.get('body', ''),
                "query_type": "web_search",
            }
            for hit in raw_hits
        ]
    except Exception as e:
        print(f"Search error for '{query}': {e}")
        return []
168
+
169
async def get_news_from_api(company_name):
    """
    Fetch ESG-related news about *company_name* via NewsAPI.

    Returns a list of {'url', 'title', 'content', 'query_type'} dicts;
    [] when the NEWS_API_KEY env var is unset or the request/parse fails.
    """
    import re  # local import: this module does not import re at top level

    api_key = os.getenv('NEWS_API_KEY')
    if not api_key:
        return []

    try:
        url = "https://newsapi.org/v2/everything"
        params = {
            'q': f'{company_name} AND (sustainability OR greenwashing OR ESG OR environmental)',
            'language': 'en',
            'sortBy': 'relevancy',
            'pageSize': 15,
            'apiKey': api_key
        }

        # requests is blocking, so run it in a worker thread to keep the loop free.
        response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
        data = response.json()

        if data.get('status') != 'ok':
            return []

        # NEGATIVE FILTER: exclude crime/fraud coverage. Matched on word
        # boundaries -- a plain substring test would let short tokens such as
        # "ed" (Enforcement Directorate) reject titles containing "reduced"
        # or "pledged".
        bad_keywords = ["fraud", "arrest", "scam", "police", "laundering",
                        "jail", "cbi", "ed", "bribe", "punish", "litigation"]
        bad_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, bad_keywords)) + r")\b")

        articles = []
        for article in data.get('articles', []):
            # NewsAPI redacts some articles; skip those placeholders.
            if article.get('title') == '[Removed]':
                continue

            title_lower = (article.get('title') or "").lower()
            if bad_regex.search(title_lower):
                continue

            # POSITIVE ESG filtering is delegated to the NewsAPI query itself
            # ("AND (sustainability OR ...)"), mirroring the web-search path.

            articles.append({
                'url': article.get('url', ''),
                'title': article.get('title', ''),
                'content': (article.get('description') or '') + ' ' + (article.get('content') or ''),
                'query_type': 'news_api'
            })
        return articles
    except Exception as e:
        print(f"NewsAPI error: {e}")

    return []
223
+
224
+ # Helper for Filtering
225
def is_valid_result(res):
    """Filter out navigational, login, and otherwise useless search hits."""
    url = res.get('url', '').lower()
    title = res.get('title', '').lower()

    # Generic/navigational domains that never carry article content.
    blocked_domains = ('google.com/search', 'google.com/url', 'accounts.google.com',
                       'support.google.com', 'youtube.com', 'facebook.com',
                       'twitter.com/login', 'linkedin.com/login')

    # Titles indicating auth walls, bot checks, or downloads.
    blocked_terms = ('sign in', 'log in', 'forgot password', 'download',
                     'captcha', 'security check', 'robot', 'access denied')

    if any(domain in url for domain in blocked_domains):
        return False
    if any(term in title for term in blocked_terms):
        return False

    # Minimum-content-length gating intentionally disabled for now.
    return True
245
+
246
async def get_company_news(company_name):
    """Collect ESG-related news for *company_name* from NewsAPI plus DuckDuckGo.

    Results are keyword-filtered (must carry ESG/climate context, must not be
    crime/fraud coverage), de-duplicated by URL, and capped at 20 articles.
    """
    import re  # local import: this module does not import re at top level

    report_progress(f"Starting news collection for {company_name}", 10)

    articles = []
    # 1. NewsAPI first (most reliable when a key is configured).
    report_progress("Checking NewsAPI...", 15)
    api_articles = await get_news_from_api(company_name)
    articles.extend(api_articles)

    # 2. DuckDuckGo web search for deeper coverage.
    report_progress("Fetching additional news via Web Search...", 25)

    queries = [
        f'"{company_name}" environmental impact report news',
        f'"{company_name}" greenwashing controversy scandal',
        f'"{company_name}" sustainability goals criticism',
        f'"{company_name}" ESG rating news detected',
        f'"{company_name}" climate change commitments review'
    ]

    # ESG/Climate keywords (refined to avoid generic matches).
    # "green" and "environment" are excluded deliberately: they match
    # "green light", "business environment", etc.
    ESG_KEYWORDS = [
        "climate", "carbon", "emission", "pollution", "sustainability", "esg",
        "renewable", "net zero", "biodiversity", "ecological", "greenhouse", "fossil fuel"
    ]

    # Negative keywords exclude financial-crime/generic news. Matched on word
    # boundaries: a substring test would let "ed" reject "pledged"/"reduced".
    NEGATIVE_KEYWORDS = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe"]
    negative_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, NEGATIVE_KEYWORDS)) + r")\b")

    for query in queries:
        if len(articles) >= 20:
            break

        results = await search_web(query, max_results=5)
        for res in results:
            if not is_valid_result(res):
                continue

            # search_web() normalises the snippet under 'content' (not 'body'),
            # so inspect title + content together for context.
            text_to_check = (res.get('title', '') + " " + res.get('content', '')).lower()
            title_lower = res.get('title', '').lower()

            # 1. NEGATIVE FILTER: drop crime/fraud stories immediately.
            if negative_regex.search(title_lower):
                continue

            # 2. POSITIVE FILTER: require ESG context. "environmental" is
            # re-admitted explicitly (the bare word "environment" was deemed
            # too generic for the keyword list above).
            if "environmental" in text_to_check:
                pass
            elif not any(k in text_to_check for k in ESG_KEYWORDS):
                continue  # Skip if no environmental context found

            # Simple de-duplication by URL.
            if not any(a['url'] == res['url'] for a in articles):
                articles.append(res)

    report_progress(f"News collection complete: {len(articles)} articles", 45)
    return articles[:20]
304
+
305
async def get_company_reviews(company_name):
    """Collect employee/customer reviews with environmental context for *company_name*.

    Searches review-heavy sites (Glassdoor, Reddit, Trustpilot) plus broad
    fallback queries, keeps only hits that actually mention the company, and
    caps the result at 40 entries. Returns a list of
    {'url', 'title', 'content', 'source_type'} dicts. No mock-data fallback.
    """
    report_progress(f"Starting review collection for {company_name}", 50)

    reviews = []
    c_name_lower = company_name.lower()
    name_tokens = c_name_lower.split()

    def _mentions_company(res):
        """True when the hit's title/snippet plausibly refers to the company."""
        res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
        if c_name_lower in res_content:
            return True
        # Fall back to the first word (e.g. "Google" from "Google Inc"), but
        # only when it is long enough not to be a generic word like "The".
        if name_tokens and len(name_tokens[0]) > 3:
            return name_tokens[0] in res_content
        return False

    # site: operators force specific review sources; the last entries are
    # broad fallbacks in case the targeted queries come up empty.
    queries = [
        f'site:glassdoor.com "{company_name}" reviews "environment" OR "sustainability"',
        f'site:reddit.com "{company_name}" greenwashing OR "toxic"',
        f'site:trustpilot.com "{company_name}" environment',
        f'"{company_name}" employee reviews sustainability ethics',
        f'"{company_name}" environmental controversy reviews',
        f'"{company_name}" corporate responsibility feedback'
    ]

    total_queries = len(queries)
    for idx, query in enumerate(queries):
        progress = 50 + (idx / total_queries) * 30
        report_progress(f"Searching specific reviews: {query}", int(progress))

        results = await search_web(query, max_results=8)

        for res in results:
            if len(reviews) >= 40:
                break
            if not is_valid_result(res):
                continue
            if not _mentions_company(res):
                print(f"Skipping unrelated result: {res['title']}")
                continue

            # Tag the source from the URL so the UI can badge it.
            source = "web"
            if "glassdoor" in res['url']: source = "Glassdoor"
            elif "twitter" in res['url'] or "x.com" in res['url']: source = "Twitter"
            elif "linkedin" in res['url']: source = "LinkedIn"
            elif "reddit" in res['url']: source = "Reddit"
            elif "trustpilot" in res['url']: source = "Trustpilot"

            # Clean site suffixes out of the title.
            title = res['title'].replace(" | Glassdoor", "").replace(" | Reddit", "")

            reviews.append({
                "url": res['url'],
                "title": title,
                "content": res['content'],  # the snippet doubles as the review text
                "source_type": source
            })

        # Gentle pacing between queries to avoid search-engine throttling.
        await asyncio.sleep(1)

    # Broader fallback when the targeted queries found almost nothing.
    if len(reviews) < 3:
        report_progress("Few reviews found, trying specific broader query...", 75)
        fallback_results = await search_web(f'"{company_name}" reviews environment', max_results=5)
        for res in fallback_results:
            if not is_valid_result(res):
                continue
            if any(r['url'] == res['url'] for r in reviews):
                continue
            if not _mentions_company(res):
                continue

            reviews.append({
                "url": res['url'],
                "title": res['title'],
                "content": res['content'],
                "source_type": "Web Search"
            })

    report_progress(f"Review collection complete: {len(reviews)} reviews", 80)
    return reviews
binary_to_report_name_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
3
+ size 74
category_to_greenwashing_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
3
+ size 44
ensemble_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
3
+ size 246560
ml_models/all_feature_columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
3
+ size 219
ml_models/binary_to_report_name_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
3
+ size 74
ml_models/category_to_greenwashing_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
3
+ size 44
ml_models/ensemble_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
3
+ size 246560