Tanxshh commited on
Commit
02cc7f6
·
1 Parent(s): 8110699

Deploy GreenIntellect Backend API with ML models and scraping

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +26 -0
  2. .gitattributes +2 -33
  3. Dockerfile +72 -0
  4. README.md +48 -5
  5. all_feature_columns.pkl +3 -0
  6. app/__init__.py +0 -0
  7. app/__pycache__/__init__.cpython-310.pyc +0 -0
  8. app/__pycache__/__init__.cpython-311.pyc +0 -0
  9. app/__pycache__/main.cpython-310.pyc +0 -0
  10. app/__pycache__/main.cpython-311.pyc +0 -0
  11. app/api/__pycache__/endpoints.cpython-310.pyc +0 -0
  12. app/api/__pycache__/endpoints.cpython-311.pyc +0 -0
  13. app/api/endpoints.py +477 -0
  14. app/db/__pycache__/models.cpython-310.pyc +0 -0
  15. app/db/__pycache__/models.cpython-311.pyc +0 -0
  16. app/db/__pycache__/session.cpython-310.pyc +0 -0
  17. app/db/__pycache__/session.cpython-311.pyc +0 -0
  18. app/db/models.py +37 -0
  19. app/db/session.py +20 -0
  20. app/main.py +33 -0
  21. app/services/__pycache__/analysis_engine.cpython-310.pyc +0 -0
  22. app/services/__pycache__/analysis_engine.cpython-311.pyc +0 -0
  23. app/services/__pycache__/hugchat_client.cpython-311.pyc +0 -0
  24. app/services/__pycache__/llm_generator.cpython-311.pyc +0 -0
  25. app/services/__pycache__/ml_logic.cpython-311.pyc +0 -0
  26. app/services/__pycache__/ml_models.cpython-310.pyc +0 -0
  27. app/services/__pycache__/ml_models.cpython-311.pyc +0 -0
  28. app/services/__pycache__/pdf_processor.cpython-310.pyc +0 -0
  29. app/services/__pycache__/pdf_processor.cpython-311.pyc +0 -0
  30. app/services/__pycache__/perplexity_client.cpython-311.pyc +0 -0
  31. app/services/__pycache__/scoring.cpython-310.pyc +0 -0
  32. app/services/__pycache__/scoring.cpython-311.pyc +0 -0
  33. app/services/__pycache__/scraper.cpython-310.pyc +0 -0
  34. app/services/__pycache__/scraper.cpython-311.pyc +0 -0
  35. app/services/analysis_engine.py +425 -0
  36. app/services/hugchat_client.py +54 -0
  37. app/services/llm_generator.py +229 -0
  38. app/services/ml_logic.py +137 -0
  39. app/services/ml_models.py +26 -0
  40. app/services/pdf_processor.py +21 -0
  41. app/services/perplexity_client.py +58 -0
  42. app/services/scoring.py +139 -0
  43. app/services/scraper.py +393 -0
  44. binary_to_report_name_mapping.pkl +3 -0
  45. category_to_greenwashing_mapping.pkl +3 -0
  46. ensemble_model.pkl +3 -0
  47. ml_models/all_feature_columns.pkl +3 -0
  48. ml_models/binary_to_report_name_mapping.pkl +3 -0
  49. ml_models/category_to_greenwashing_mapping.pkl +3 -0
  50. ml_models/ensemble_model.pkl +3 -0
.dockerignore ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ venv/
6
+ .venv/
7
+
8
+ # Database (created at runtime)
9
+ *.db
10
+
11
+ # Uploads (created at runtime)
12
+ uploads/
13
+
14
+ # Environment files
15
+ .env
16
+ .env.local
17
+
18
+ # IDE
19
+ .vscode/
20
+ .idea/
21
+
22
+ # Logs
23
+ *.log
24
+
25
+ # Git
26
+ .git/
.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Git LFS for large model files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  *.pkl filter=lfs diff=lfs merge=lfs -text
3
  *.pt filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hugging Face Spaces - GreenIntellect Backend API
# Python FastAPI + ML Models + Scraping

FROM python:3.11-slim

# Create non-root user (required by Hugging Face)
RUN useradd -m -u 1000 user
WORKDIR /app

# Install system dependencies for Selenium/Chromium
# (chromium + chromium-driver come from Debian; the lib* packages are the
# shared libraries headless Chromium needs to start in a slim image)
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    gnupg \
    chromium \
    chromium-driver \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libwayland-client0 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Set Chrome environment variables (read by the scraper to locate the browser)
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

# Copy and install Python dependencies first so this layer is cached
# independently of application-code changes
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --timeout=300 -r /app/requirements.txt

# Copy ML model files (duplicated at /app and /app/ml_models below —
# NOTE(review): confirm both locations are actually read before pruning)
COPY ensemble_model.pkl /app/ensemble_model.pkl
COPY all_feature_columns.pkl /app/all_feature_columns.pkl
COPY binary_to_report_name_mapping.pkl /app/binary_to_report_name_mapping.pkl
COPY category_to_greenwashing_mapping.pkl /app/category_to_greenwashing_mapping.pkl

# Copy backend application
COPY app /app/app
COPY ml_models /app/ml_models

# Create directories; chown so the non-root user can write uploads/ and the SQLite DB
RUN mkdir -p /app/uploads && chown -R user:user /app

# Switch to non-root user
USER user

# Environment variables (7860 is the port Hugging Face Spaces expects)
ENV PORT=7860
ENV HOST=0.0.0.0
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH=/app

EXPOSE 7860

# Start FastAPI
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,54 @@
1
  ---
2
- title: Greenintellect
3
- emoji: 😻
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: GreenIntellect API
3
+ emoji: 🌿
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
  ---
10
 
11
+ # 🌿 GreenIntellect API
12
+
13
+ AI-powered API for analyzing sustainability reports and detecting greenwashing.
14
+
15
+ ## API Endpoints
16
+
17
+ | Endpoint | Method | Description |
18
+ |----------|--------|-------------|
19
+ | `/api/` | GET | API health check |
20
+ | `/api/analyze` | POST | Analyze text for greenwashing |
21
+ | `/api/upload` | POST | Upload PDF for analysis |
22
+ | `/api/requests` | GET | Get analysis requests |
23
+ | `/` | GET | API welcome message |
24
+
25
+ ## Usage
26
+
27
+ ```python
28
+ import requests
29
+
30
+ # Analyze text
31
+ response = requests.post(
32
+ "https://tanxshh-greenintellect.hf.space/api/analyze",
33
+ json={"company_name": "Example Corp", "text": "Our sustainable practices..."}
34
+ )
35
+ print(response.json())
36
+ ```
37
+
38
+ ## Features
39
+
40
+ - 📄 PDF/Text Analysis
41
+ - 🔍 Greenwashing Detection
42
+ - 📊 Sentiment Analysis
43
+ - 🌐 Web Scraping (News & Reviews)
44
+ - 🤖 AI-powered Insights
45
+
46
+ ## Technology
47
+
48
+ - FastAPI + Python 3.11
49
+ - FinBERT & Sentence Transformers
50
+ - Selenium + Chromium for scraping
51
+ - SQLite Database
52
+
53
+ ---
54
+ Built with ❤️ for a sustainable future
all_feature_columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
3
+ size 219
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (113 Bytes). View file
 
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes). View file
 
app/__pycache__/main.cpython-310.pyc ADDED
Binary file (952 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (1.53 kB). View file
 
app/api/__pycache__/endpoints.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
app/api/__pycache__/endpoints.cpython-311.pyc ADDED
Binary file (25.7 kB). View file
 
app/api/endpoints.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
2
+ from sqlalchemy.orm import Session
3
+ from typing import List
4
+ import shutil
5
+ import os
6
+ import json
7
+ from datetime import datetime
8
+ import csv
9
+ import io
10
+ import time
11
+ import random
12
+ from ..db.session import get_db
13
+ from ..db.models import Company, AnalysisRequest
14
+ from ..services.analysis_engine import analyze_company
15
+ from ..services.ml_logic import predict_greenwashing_risk
16
+
17
router = APIRouter()

# Directory for uploaded documents; created eagerly at import time so the
# first upload request never races against a missing folder.
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
21
+
22
@router.post("/requests")
async def create_request(
    company_name: str = Form(...),
    file: UploadFile = File(...),
    db: Session = Depends(get_db)
):
    """Accept a document upload and queue it as a pending analysis request.

    The file is saved under UPLOAD_DIR and a pending AnalysisRequest row is
    created; the actual analysis runs later via POST /requests/{id}/approve.

    Returns the newly created AnalysisRequest row.
    """
    # Use only the basename of the client-supplied filename so a crafted
    # name like "../../etc/cron.d/x" cannot escape the upload directory
    # (path-traversal hardening).
    safe_name = os.path.basename(file.filename or "upload.bin")
    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    # Create Request Record (Pending)
    db_request = AnalysisRequest(
        user_id="demo-user",  # TODO: replace with the authenticated user id
        company_name=company_name,
        document_name=safe_name,
        document_content=file_path,  # stores the path; text is extracted at approval time
        status="pending"
    )
    db.add(db_request)
    db.commit()
    db.refresh(db_request)

    return db_request
46
+
47
@router.post("/requests/{id}/approve")
async def approve_request(id: int, db: Session = Depends(get_db)):
    """Approve a pending request: run the full company analysis and persist it.

    State machine: pending -> processing -> completed (or failed on error).
    On success the result is also copied onto the Company row (created if
    missing) so the dashboard can read it without joining requests.

    Raises:
        HTTPException 404: unknown request id.
        HTTPException 400: request is not in "pending" state.
        HTTPException 500: analysis raised; the error text is stored in
            rejection_reason and the request is marked "failed".
    """
    db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if not db_request:
        raise HTTPException(status_code=404, detail="Request not found")

    if db_request.status != "pending":
        raise HTTPException(status_code=400, detail="Request already processed")

    try:
        # Mark as processing and commit immediately so concurrent approvals
        # of the same request see the state change.
        db_request.status = "processing"
        db.commit()

        # Run Analysis
        # Note: document_content currently holds the file path from create_request
        file_path = db_request.document_content
        result = await analyze_company(db_request.company_name, file_path)

        # Update Request
        db_request.status = "completed"
        db_request.analysis_result = result

        # Update or Create Company Record
        company = db.query(Company).filter(Company.name == db_request.company_name).first()
        if not company:
            company = Company(name=db_request.company_name)
            db.add(company)

        company.analysis_result = result
        company.last_analysis_date = datetime.now()

        db.commit()

        return result

    except Exception as e:
        # Record the failure on the request row so the UI can surface it,
        # then surface the same message to the caller.
        db_request.status = "failed"
        db_request.rejection_reason = str(e)
        db.commit()
        raise HTTPException(status_code=500, detail=str(e))
88
+
89
@router.post("/requests/{id}/reject")
def reject_request(id: int, reason: str = Form(...), db: Session = Depends(get_db)):
    """Reject a pending analysis request by deleting its row.

    The supplied reason is echoed back to the caller (the row itself is
    removed, so the reason is not persisted).

    Raises:
        HTTPException 404: unknown request id.
    """
    db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if not db_request:
        raise HTTPException(status_code=404, detail="Request not found")

    # Capture the name BEFORE deleting: after delete()+commit() the ORM
    # instance is expired/deleted and attribute access raises
    # (ObjectDeletedError / DetachedInstanceError).
    company_name = db_request.company_name

    db.delete(db_request)
    db.commit()
    return {"message": f"Request for {company_name} rejected and deleted", "reason": reason}
99
+
100
@router.get("/requests")
def get_requests(db: Session = Depends(get_db)):
    """Return every analysis request on record, regardless of status."""
    all_requests = db.query(AnalysisRequest).all()
    return all_requests
103
+
104
@router.get("/companies")
def get_companies(db: Session = Depends(get_db)):
    """Return every company row, including its stored analysis JSON."""
    all_companies = db.query(Company).all()
    return all_companies
107
+
108
@router.post("/companies/bulk")
def bulk_import_companies(companies: List[dict], db: Session = Depends(get_db)):
    """Bulk import companies from CSV or other sources.

    Each entry may carry: name (required), description, website, analysis.
    Companies are matched by name: existing rows are updated in place, new
    ones are created. Entries without a "name" are skipped — previously a
    nameless entry would have created a NULL-named row (and, because of the
    unique constraint on name, broken every later nameless import).

    Returns the number of rows touched and their names.
    """
    imported = []
    for company_data in companies:
        name = company_data.get("name")
        if not name:
            # Robustness: never create a company without a usable name.
            continue

        # Check if company already exists
        existing = db.query(Company).filter(Company.name == name).first()
        if existing:
            # Update existing; description/website keep their old values
            # when the payload omits them.
            existing.analysis_result = company_data.get("analysis")
            existing.last_analysis_date = datetime.now()
            existing.description = company_data.get("description", existing.description)
            existing.website = company_data.get("website", existing.website)
            imported.append(existing)
        else:
            # Create new
            new_company = Company(
                name=name,
                description=company_data.get("description", ""),
                website=company_data.get("website", ""),
                analysis_result=company_data.get("analysis"),
                last_analysis_date=datetime.now()
            )
            db.add(new_company)
            imported.append(new_company)

    # Single commit for the whole batch keeps the import atomic.
    db.commit()
    return {"imported": len(imported), "companies": [c.name for c in imported]}
136
+
137
@router.get("/company/{id}")
def get_company(id: int, db: Session = Depends(get_db)):
    """Return one company by ID.

    Raises:
        HTTPException 404: unknown company id. (Previously this silently
        returned null, inconsistent with delete_company's 404 behaviour.)
    """
    company = db.query(Company).filter(Company.id == id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")
    return company
140
+
141
@router.delete("/companies/all")
def delete_all_companies(db: Session = Depends(get_db)):
    """Delete all companies from the database."""
    # Bulk delete at the query level; returns the number of rows removed.
    removed = db.query(Company).delete()
    db.commit()
    return {"message": f"Deleted {removed} companies"}
147
+
148
@router.delete("/company/{id}")
def delete_company(id: int, db: Session = Depends(get_db)):
    """Delete a specific company by ID.

    Raises:
        HTTPException 404: unknown company id.
    """
    company = db.query(Company).filter(Company.id == id).first()
    if not company:
        raise HTTPException(status_code=404, detail="Company not found")

    # Capture the name BEFORE deleting: reading company.name after
    # delete()+commit() hits an expired/deleted ORM instance and raises.
    company_name = company.name

    db.delete(company)
    db.commit()
    return {"message": f"Deleted company {company_name}"}
158
+
159
@router.delete("/requests/cleanup")
def cleanup_requests(db: Session = Depends(get_db)):
    """Delete requests that are completed, rejected, or failed"""
    terminal_states = ["completed", "rejected", "failed"]
    # Bulk delete without syncing the session — nothing else in this
    # request handler reads the deleted rows afterwards.
    removed = (
        db.query(AnalysisRequest)
        .filter(AnalysisRequest.status.in_(terminal_states))
        .delete(synchronize_session=False)
    )
    db.commit()
    return {"message": f"Cleaned up {removed} processed requests"}
167
+
168
@router.delete("/request/{id}")
def delete_request(id: int, db: Session = Depends(get_db)):
    """Force delete a request"""
    target = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="Request not found")
    db.delete(target)
    db.commit()
    return {"message": "Request deleted"}
177
+
178
@router.post("/companies/upload-csv")
async def upload_companies_csv(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """
    Upload CSV for live greenwashing analysis with BATCH AI processing.

    Flow per row:
      1. Read flexible column names (company/name, description/text/claims).
      2. Compute missing features (sentiment, keyword frequency, vague/concrete
         ratios) from the text when the CSV does not supply them.
      3. If the CSV carries a greenwashing label, trust it and skip AI;
         otherwise fall back to the ML model, then apply heuristic overrides.
      4. Queue the row; every `batch_size` rows are sent to the AI service in
         one batch and the merged results are saved to the Company table.

    Raises:
        HTTPException 400: uploaded file is not a .csv.
    """
    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a CSV.")

    # utf-8-sig strips a BOM that Excel-exported CSVs often carry.
    content = await file.read()
    decoded = content.decode('utf-8-sig')
    csv_reader = csv.DictReader(io.StringIO(decoded))

    # Normalise headers so lookups below can use lowercase names.
    if csv_reader.fieldnames:
        csv_reader.fieldnames = [f.strip().lower() for f in csv_reader.fieldnames]

    print(f"[DEBUG] CSV Headers found: {csv_reader.fieldnames}")

    results = []
    gemini_batch = []
    batch_size = 10

    from app.services.perplexity_client import research_company, PERPLEXITY_API_KEY
    from app.services.llm_generator import generate_batch_insights

    # Import scoring utilities if not already imported (better to move to top, but here for context)
    from app.services.scoring import analyze_sentiment, calculate_vague_score, calculate_concrete_score
    import re

    # Helper for counting keywords
    def count_keywords(text: str, keywords: list) -> int:
        # Whole-word matches only (\b anchors); keywords are escaped so
        # entries like 'co2' or hyphenated terms are matched literally.
        count = 0
        text_lower = text.lower()
        for k in keywords:
            count += len(re.findall(r'\b' + re.escape(k) + r'\b', text_lower))
        return count

    # Keyword lists (reused from analysis_engine concept)
    # NOTE(review): only GREEN_KEYWORDS is used below; the other three lists
    # appear to be dead in this function — confirm before removing.
    GREEN_KEYWORDS = ['sustainable', 'eco-friendly', 'green', 'carbon neutral', 'net zero', 'renewable', 'biodegradable']
    EMISSION_KEYWORDS = ['emission', 'co2', 'carbon']
    ENERGY_KEYWORDS = ['energy', 'solar', 'wind', 'power']
    WASTE_KEYWORDS = ['waste', 'recycling', 'plastic']

    # NOTE(review): gemini_batch/batch_size are re-initialised here, shadowing
    # the identical assignments above — harmless but redundant.
    gemini_batch = []
    batch_size = 10

    def process_batch_and_save(batch_items):
        # Flush a queued batch: ask the AI service for insights on the items
        # that need them, then persist one Company row per item.
        if not batch_items: return

        # Split batch into AI-needed and Fast-Path
        ai_needed_items = [item for item in batch_items if not item.get('skip_ai')]
        fast_path_items = [item for item in batch_items if item.get('skip_ai')]

        batch_insights = {}

        # 1. Generate AI Insights ONLY for needed items
        if ai_needed_items:
            ai_inputs = [{"name": item['name'], "context": item['context']} for item in ai_needed_items]
            print(f"Processing batch of {len(ai_inputs)} companies via AI Service...")

            # Add small delay only if calling AI
            if len(ai_inputs) > 0:
                time.sleep(2)

            batch_insights = generate_batch_insights(ai_inputs)

        # 2. Merge and Save (Process both lists)
        for item in batch_items:
            name = item['name']

            if item.get('skip_ai'):
                # Fast Path Defaults
                desc = item.get('text')[:500] if item.get('text') else "Imported via CSV (Manual Assessment)"
                recs = ["Maintain current transparency"] if item['gw_label'] == 0 else ["Improve data disclosure"]
            else:
                # AI Results
                insights = batch_insights.get(name, {})
                desc = insights.get("description", "AI description pending or unavailable.")
                recs = insights.get("recommendations", {})

            # Construct Final Result
            # Shape must match what the frontend (Analytics.tsx) reads.
            analysis_result = {
                "company_name": name,
                "company_description": desc,
                "last_updated": datetime.now().isoformat(),
                "confidence_score": f"{item['prediction']['details'].get('confidence', 'N/A')}% (AI)" if not item.get('skip_ai') else "100% (Manual)",
                "greenwashingLabel": item['gw_label'],
                "internal_documents_analysis": {
                    "major_findings": [
                        f"Risk Level: {item['final_label_str']}",
                        f"Reason: {item['reasoning_text']}"
                    ],
                    "compliance_risks": [item['reasoning_text']] if item['gw_label'] == 1 else []
                },
                "reviews_analysis": {
                    "employee_tone": "N/A",
                    "customer_tone": "N/A",
                    "common_issues": [],
                    "overall_sentiment_score": f"{int(item['features_dict']['overall_sentiment_score'] * 100)}/100"
                },
                "recommended_actions": recs,
                "external_summary": {
                    "key_highlights": [f"External Sentiment Gap: {item['features_dict']['external_sentiment_gap']}"],
                    "public_sentiment": "Mixed" if item['features_dict']['external_sentiment_gap'] > 0.1 else "Positive",
                    "recent_news_summary": item['reasoning_text'],
                    "possible_bias": "None",
                },
                "risk_assessment": {
                    "financial_risk": "High" if item['final_label_str'] == "Greenwashing" else "Low",
                    "reputation_risk": "Critical" if item['final_label_str'] == "Greenwashing" else ("Medium" if item['final_label_str'] == "At Risk" else "Low"),
                    "compliance_risk": "Medium",
                    "market_risk": "Low",
                    "overall_risk_level": item['final_label_str']
                },
                "final_company_score": {
                    # Sentiment may arrive as 0-1 or already 0-100; scale only
                    # the 0-1 case.
                    "rating_out_of_100": int(item['features_dict']['overall_sentiment_score'] * 100) if item['features_dict']['overall_sentiment_score'] <= 1 else int(item['features_dict']['overall_sentiment_score']),
                    "label": item['prediction']['model_label']
                },
                "detailed_scores": item['features_dict'],
                "generated_summary": f"Classified as {item['prediction']['model_label']}"
            }

            results.append({"name": name, "label": item['gw_label'], "status": f"Processed ({item['final_label_str']})"})

            # DB Save
            existing = db.query(Company).filter(Company.name == name).first()
            if existing:
                existing.analysis_result = analysis_result
                existing.last_analysis_date = datetime.now()
            else:
                new_company = Company(
                    name=name,
                    description=desc,
                    analysis_result=analysis_result,
                    last_analysis_date=datetime.now()
                )
                db.add(new_company)
        db.commit()

    for row in csv_reader:
        # Flexible column names (normalized)
        name = row.get('company_name') or row.get('company') or row.get('name')
        text = row.get('description') or row.get('text') or row.get('claims') or ""

        if not name:
            continue

        # --- FEATURE CALCULATION (If columns missing) ---
        # 1. Base Sentiment
        sentiment_res = analyze_sentiment([text] if text else [])
        overall_sentiment = sentiment_res['score']

        # 2. Keyword Stats
        # Both header spellings are checked because historical CSVs contain
        # the typo 'frequecy' — do not "fix" these lookup strings.
        green_freq = float(row.get('green keyword frequecy') or row.get('green keyword frequency') or count_keywords(text, GREEN_KEYWORDS))

        # 3. Vague/Concrete Scores (Using simple heuristic or scoring func)
        # Assuming scoring.py has these, if not, fallback to simple version:
        try:
            # Basic sentence splitting
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            vague_ratio = float(row.get('vague keyword ratio') or calculate_vague_score(sentences))
            concrete_ratio = float(row.get('concrete cailm ratio') or row.get('concrete claim ratio') or calculate_concrete_score(sentences))
        except:
            # NOTE(review): bare except hides real scoring errors; fixed
            # fallback ratios keep the import running on malformed rows.
            vague_ratio = 0.2
            concrete_ratio = 0.3

        # 4. Aspect Sentiments (Fallback to overall if specific not found)
        # 'emission sentiment ' (trailing space) matches a known dirty header.
        emission_sent = float(row.get('emission sentiment ') or row.get('emission sentiment') or overall_sentiment)
        energy_sent = float(row.get('energy sentiment') or overall_sentiment)
        waste_sent = float(row.get('waste sentiment') or overall_sentiment)

        # EXTRACT FEATURES FOR MODEL (AND FRONTEND DISPLAY)
        # Naming Verification:
        # Frontend (Analytics.tsx) expects:
        # - green_keyword_frequency
        # - vague_keyword_ratio
        # - concrete_claim_ratio
        # - external_sentiment_gap
        # - emission_sentiment
        # - energy_sentiment
        # - waste_sentiment
        # - relative_focus_score

        features_dict = {
            'green_keyword_frequency': green_freq,
            'vague_keyword_ratio': vague_ratio,
            'concrete_claim_ratio': concrete_ratio,
            'overall_sentiment_score': overall_sentiment,
            'external_sentiment_gap': float(row.get('external_sentiment_gap') or 0.4),
            'emission_sentiment': emission_sent,
            'energy_sentiment': energy_sent,
            'waste_sentiment': waste_sent,
            'relative_focus_score': float(row.get('relative focus score') or 0.5)
        }

        gw_label_raw = row.get('greenwashing_label') or row.get('greenwashing label') or row.get('category')
        skip_ai = False

        if gw_label_raw:
            # Manual label from CSV - TRUST IT (No AI)
            skip_ai = True
            final_label_str = str(gw_label_raw).strip()
            if final_label_str.lower() in ['greenwashing', 'high', 'critical', '1']:
                final_label_str = "Greenwashing"; gw_label = 1
            elif final_label_str.lower() in ['medium', 'at risk']:
                final_label_str = "At Risk"; gw_label = 1
            else:
                final_label_str = "No Risk"; gw_label = 0

            reasoning_text = f"Classified as {final_label_str} based on historical CSV data."

            # Initialize dummy prediction for compatibility
            prediction = {
                'risk_label': final_label_str,
                'greenwashing_risk': gw_label,
                'details': {'confidence': 100},
                'model_label': final_label_str
            }
        else:
            # AI/Model Prediction (Fallback only if no label)
            prediction = predict_greenwashing_risk(text, company_name=name, features_dict=features_dict)

            final_label_str = prediction['risk_label']
            # Map old AI outputs to new strings just in case
            if final_label_str == "High" or final_label_str == "Critical": final_label_str = "Greenwashing"
            elif final_label_str == "Medium": final_label_str = "At Risk"
            elif final_label_str == "Low": final_label_str = "No Risk"

            gw_label = 1 if final_label_str in ["Greenwashing", "At Risk"] else 0
            reasoning_text = f"AI Analysis: Classified as {final_label_str} based on pattern matching."

            # --- HEURISTIC OVERRIDE (Forcing Sensitivity) ---
            # If Vague > 0.50 AND not enough concrete data to justify it (>10%)
            if vague_ratio > 0.50 and concrete_ratio < 0.10:
                final_label_str = "Greenwashing"
                gw_label = 1
                reasoning_text = "Risk High: Excessive vague language without supporting concrete data."
            elif concrete_ratio < 0.01 and overall_sentiment > 0.6:
                final_label_str = "Greenwashing"
                gw_label = 1
                reasoning_text = "Greenwashing Alert: Positive claims lack concrete evidence."

        # PERPLEXITY CHECK (Instant Processing for Paid API)
        # NOTE(review): this branch computes desc/recs/pplx_success but then
        # falls through to `pass` — it is effectively dead code (the item is
        # still queued for the batch below). Left as-is pending a decision
        # on the refactor described in the comments.
        pplx_success = False
        if PERPLEXITY_API_KEY and not skip_ai:
            pplx_data = research_company(name)
            if pplx_data:
                pplx_success = True
                # If Perplexity worked, save immediately and skip batch
                # Construct partial item to reuse logic or save directly?
                # Saving directly is safer to avoid mixups.
                desc = pplx_data.get("description", "AI unavailable")
                recs = pplx_data.get("recommendations", {})
                if "Controversy" in str(pplx_data.get("findings")): gw_label = 1  # Update risk

                # Plan: Construct `item` manually, adding 'pplx_insights' key.
                # Update `process_batch` to check for it.
                pass

        # Prepare Context
        context = f"""
        Greenwashing Risk: {final_label_str}
        Reason: {reasoning_text}
        Sentiment: {features_dict['overall_sentiment_score']:.2f}
        """

        item_data = {
            "name": name,
            "text": text,
            "context": context,
            "prediction": prediction,
            "features_dict": features_dict,
            "gw_label": gw_label,
            "final_label_str": final_label_str,
            "reasoning_text": reasoning_text,
            "skip_ai": skip_ai
        }

        # Queue for Batch
        gemini_batch.append(item_data)

        if len(gemini_batch) >= batch_size:
            process_batch_and_save(gemini_batch)
            gemini_batch = []

    # Final batch
    if gemini_batch:
        process_batch_and_save(gemini_batch)

    return {
        "message": f"Processed {len(results)} companies using Batch AI Analysis.",
        "predictions": results
    }
app/db/__pycache__/models.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
app/db/__pycache__/models.cpython-311.pyc ADDED
Binary file (2.47 kB). View file
 
app/db/__pycache__/session.cpython-310.pyc ADDED
Binary file (639 Bytes). View file
 
app/db/__pycache__/session.cpython-311.pyc ADDED
Binary file (1.03 kB). View file
 
app/db/models.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, Text, JSON, DateTime, ForeignKey, Float
2
+ from sqlalchemy.orm import relationship
3
+ from datetime import datetime
4
+ from .session import Base
5
+
6
class Company(Base):
    """A company together with its most recent greenwashing analysis."""

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)
    # Unique: endpoints look companies up by name and upsert on it.
    name = Column(String, unique=True, index=True)
    description = Column(Text, nullable=True)
    website = Column(String, nullable=True)
    # NOTE(review): datetime.utcnow is naive and deprecated in Python 3.12;
    # consider a timezone-aware default — kept as-is to avoid changing stored values.
    last_analysis_date = Column(DateTime, default=datetime.utcnow)

    # JSON blobs for structured analysis data
    analysis_result = Column(JSON, nullable=True)

    requests = relationship("AnalysisRequest", back_populates="company")
19
+
20
class AnalysisRequest(Base):
    """One user-submitted analysis job, tracked through its lifecycle."""

    __tablename__ = "requests"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(String, index=True)  # Linking to frontend user ID
    company_name = Column(String)
    website = Column(String, nullable=True)
    document_name = Column(String, nullable=True)
    # Extracted text from PDF — NOTE(review): create_request currently stores
    # the uploaded file's *path* here; text extraction happens at approval.
    document_content = Column(Text, nullable=True)

    status = Column(String, default="pending")  # pending, processing, completed, failed
    submission_date = Column(DateTime, default=datetime.utcnow)

    analysis_result = Column(JSON, nullable=True)
    # Populated with the error message when an approval attempt fails.
    rejection_reason = Column(String, nullable=True)

    company_id = Column(Integer, ForeignKey("companies.id"), nullable=True)
    company = relationship("Company", back_populates="requests")
app/db/session.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Local SQLite file in the working directory; swap in the commented URL
# (with real credentials) to run against Postgres.
SQLALCHEMY_DATABASE_URL = "sqlite:///./greenintellect.db"
# SQLALCHEMY_DATABASE_URL = "postgresql://user:password@postgresserver/db"

# check_same_thread=False is needed because FastAPI may hand the SQLite
# connection to a different thread than the one that created it.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

Base = declarative_base()

def get_db():
    """FastAPI dependency: yield a DB session, always closing it afterwards."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
app/main.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv
from .api import endpoints
from .db.session import engine, Base

# Load environment variables (API keys etc.) before anything else reads them.
load_dotenv()

# Create tables on startup (no-op if they already exist).
Base.metadata.create_all(bind=engine)

app = FastAPI(title="Green Intellect API", version="1.0.0")

# CORS: wide open for development; restrict allow_origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(endpoints.router, prefix="/api")

@app.get("/")
def read_root():
    """Health-check / landing endpoint."""
    return {"message": "Welcome to Green Intellect API"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
app/services/__pycache__/analysis_engine.cpython-310.pyc ADDED
Binary file (7.91 kB). View file
 
app/services/__pycache__/analysis_engine.cpython-311.pyc ADDED
Binary file (19.6 kB). View file
 
app/services/__pycache__/hugchat_client.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
app/services/__pycache__/llm_generator.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
app/services/__pycache__/ml_logic.cpython-311.pyc ADDED
Binary file (6.12 kB). View file
 
app/services/__pycache__/ml_models.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
app/services/__pycache__/ml_models.cpython-311.pyc ADDED
Binary file (1.8 kB). View file
 
app/services/__pycache__/pdf_processor.cpython-310.pyc ADDED
Binary file (887 Bytes). View file
 
app/services/__pycache__/pdf_processor.cpython-311.pyc ADDED
Binary file (1.62 kB). View file
 
app/services/__pycache__/perplexity_client.cpython-311.pyc ADDED
Binary file (2.87 kB). View file
 
app/services/__pycache__/scoring.cpython-310.pyc ADDED
Binary file (3.67 kB). View file
 
app/services/__pycache__/scoring.cpython-311.pyc ADDED
Binary file (7.65 kB). View file
 
app/services/__pycache__/scraper.cpython-310.pyc ADDED
Binary file (4.39 kB). View file
 
app/services/__pycache__/scraper.cpython-311.pyc ADDED
Binary file (18.4 kB). View file
 
app/services/analysis_engine.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
3
+ from .scraper import get_company_news, get_company_reviews, report_progress
4
+ from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
5
+ from .llm_generator import generate_company_description, generate_ai_recommendations
6
+
7
+ # Aspect Keywords
8
+ EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
9
+ ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
10
+ WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
11
+
12
def detect_contradictions(pdf_text, news_articles):
    """
    Detect contradictions between company claims (PDF) and external reports (news).

    Args:
        pdf_text: full extracted text of the company's sustainability report.
        news_articles: list of dicts with 'title', 'content' and 'url' keys.

    Returns:
        A list of findings, each a dict with a CONSISTENT schema:
        {claim_type, evidence_title, evidence_url, severity}.
        (Previously the first detector emitted evidence/source/risk_level while
        the others emitted evidence_url/evidence_title/severity — unified now.)
    """
    import re

    contradictions = []

    # Keywords that indicate strong claims
    claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']

    # Keywords that indicate environmental context (Strict Physical Terms only)
    # Generic words like 'green'/'sustainability' are excluded: they also
    # appear in financial contexts and caused false positives.
    env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']

    # Exclude financial regulators/crimes to avoid flagging financial fines as
    # greenwashing (RBI, SEBI, SEC, etc.).
    financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
    # BUGFIX: match exclusions on word boundaries — a plain substring test let
    # 'sec' match 'section'/'security' and silently drop valid articles.
    exclusion_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in financial_exclusions) + r')\b'
    )

    # Negative signals that question a claim.
    negative_keywords = ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']

    for article in news_articles:
        text = (article['title'] + " " + article['content']).lower()

        # Safety check: ignore articles about financial regulators/crimes.
        if exclusion_re.search(text):
            continue

        # Only count articles that are actually about the environment.
        if not any(k in text for k in env_context):
            continue

        for key in claim_keywords:
            if key in text and any(neg in text for neg in negative_keywords):
                contradictions.append({
                    "claim_type": "Environmental claim questioned",
                    "evidence_title": article['title'],
                    "evidence_url": article['url'],
                    "severity": "High"
                })
                break  # one finding per article for this detector

    # Keywords that indicate skepticism or allegations
    skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']

    pdf_lower = pdf_text.lower()
    has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)

    # Strong PDF claims vs. skeptical press coverage.
    if has_strong_claims:
        for article in news_articles:
            content_lower = article['content'].lower()
            if any(keyword in content_lower for keyword in skeptic_keywords):
                contradictions.append({
                    "claim_type": "Environmental commitment",
                    "evidence_url": article['url'],
                    "evidence_title": article['title'],
                    "severity": "High"
                })

    # General compliance-risk detection (not just contradictions).
    # BUGFIX: 'EPA' was uppercase but compared against lowercased text, so it
    # could never match — now lowercased.
    compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'epa', 'investigation', 'fraud', 'illegal']
    for article in news_articles:
        content_lower = article['content'].lower()
        if any(keyword in content_lower for keyword in compliance_keywords):
            contradictions.append({
                "claim_type": "Regulatory Compliance Issue",
                "evidence_url": article['url'],
                "evidence_title": article['title'],
                "severity": "Critical"
            })

    return contradictions
82
+
83
def detect_hidden_patterns(all_reviews):
    """
    Analyze reviews for hidden signals:
    - Repeated phrases (possible astroturfing / coordinated posting)
    - Availability of multiple platforms for cross-validation

    Only runs when more than 10 reviews are available.
    """
    findings = []

    if len(all_reviews) <= 10:
        return findings

    # Astroturfing heuristic: a low share of unique review bodies.
    snippets = [review['content'][:500] for review in all_reviews]
    uniqueness = len(set(snippets)) / len(snippets)
    if uniqueness < 0.7:
        findings.append({
            "pattern": "Potential astroturfing detected",
            "description": f"Only {int(uniqueness*100)}% unique review content - may indicate coordinated posting",
            "severity": "Medium",
        })

    # Cross-platform coverage: employee (Glassdoor) vs. public (Reddit) voices.
    glassdoor_hits = [review for review in all_reviews if 'glassdoor' in review['url'].lower()]
    reddit_hits = [review for review in all_reviews if 'reddit' in review['url'].lower()]
    if glassdoor_hits and reddit_hits:
        findings.append({
            "pattern": "Multi-platform analysis available",
            "description": f"Found {len(glassdoor_hits)} Glassdoor and {len(reddit_hits)} Reddit discussions for cross-validation",
            "severity": "Info",
        })

    return findings
116
+
117
async def analyze_company(company_name: str, pdf_path: str):
    """
    Run the full greenwashing analysis pipeline for one company.

    Steps: PDF extraction -> optional Perplexity research -> news/review
    scraping -> contradiction & pattern detection -> sentiment scoring ->
    composite score, 3-state risk classification, AI-generated insights.

    Args:
        company_name: display name used for research, scraping and prompts.
        pdf_path: path to the uploaded sustainability report PDF.

    Returns:
        A JSON-serialisable dict consumed directly by the frontend.
    """
    report_progress(f"Starting comprehensive analysis for {company_name}", 5)

    # 1. Process PDF
    report_progress("Processing PDF document...", 8)
    pdf_text = extract_text_from_pdf(pdf_path)
    pdf_sentences = split_sentences(pdf_text)

    # --- PERPLEXITY AI INTEGRATION ---
    from .perplexity_client import research_company, PERPLEXITY_API_KEY
    pplx_data = None

    if PERPLEXITY_API_KEY:
        report_progress("Conducting deep research...", 15)
        pplx_data = research_company(company_name)

    # 2. Comprehensive Scraping (ALL available sources)
    # Always run scraping to get real news, even if Perplexity is active.
    # Perplexity findings (if any) stay in pplx_data for insights and are
    # intentionally not surfaced as 'news'.
    news_articles = await get_company_news(company_name)

    # Progress 50-80% handled by get_company_reviews.
    # BUGFIX: this call was previously duplicated, doubling scrape time.
    reviews = await get_company_reviews(company_name)

    # 3. Analyze PDF Content
    report_progress("Analyzing PDF content...", 82)
    pdf_scores = calculate_scores(pdf_sentences)

    # 4. Detect Contradictions and Hidden Patterns
    report_progress("Detecting contradictions and patterns...", 85)
    contradictions = detect_contradictions(pdf_text, news_articles)
    hidden_patterns = detect_hidden_patterns(reviews)

    # 5. Analyze External Sentiment with ALL data
    report_progress("Analyzing sentiment...", 90)
    news_text = [a['content'] for a in news_articles]
    reviews_text = [r['content'] for r in reviews]
    all_external_text = news_text + reviews_text

    news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
    reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}

    # Aspect-based sentiment (REAL SCORES)
    emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
    energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
    waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)

    # 6. Calculate Evidence-Based Score with detailed metrics
    report_progress("Calculating final scores...", 95)

    # Detailed keyword/claim metrics from the PDF.
    green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
    vague_ratio = calculate_vague_score(pdf_sentences)
    concrete_ratio = calculate_concrete_score(pdf_sentences)

    def get_linear_score(s_dict):
        """Map a {label, score} sentiment dict to a 0-100 scale (50 = neutral).

        (Previously defined twice in this function with identical bodies.)
        """
        if s_dict['label'] == 'Positive':
            return 50 + (s_dict['score'] * 50)  # 50-100
        if s_dict['label'] == 'Negative':
            return 50 - (s_dict['score'] * 50)  # 0-50
        return 50  # Neutral

    # Internal sentiment from the company's own environmental sentences.
    internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])

    s_int = get_linear_score(internal_sentiment_data)
    s_ext = get_linear_score(news_sentiment)
    s_rev = get_linear_score(reviews_sentiment)

    # Composite sentiment driving the headline score:
    # 35% internal (what they say) + 45% news + 20% reviews (employee/public).
    composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)

    # Base score = qualitative sentiment, then adjusted by quantitative proof:
    # concrete data boosts, vague language penalises.
    final_score = composite_score_val

    score_modifier = 0
    score_modifier += min(concrete_ratio * 100, 25)  # Up to +25 points for concrete data
    score_modifier -= min(vague_ratio * 50, 20)      # Up to -20 points for vague language
    final_score += score_modifier

    # Contradiction Penalty (Facts Check): heavy per-finding penalty.
    if contradictions:
        final_score -= (len(contradictions) * 15)

    # Clamp to 0-100.
    final_score = max(0, min(100, final_score))

    # Gap between news and review sentiment (possible bias indicator).
    ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])

    # Determine risk level (3-state system):
    #   2 = Greenwashing (High/Critical), 1 = At Risk (Medium), 0 = No Risk (Low)
    risk_level_code = 0
    risk_reasons = []

    # 1. Contradictions (Immediate Greenwashing)
    if contradictions:
        risk_level_code = 2
        risk_reasons.append("External contradictions found")

    # 2. Score Thresholds
    if final_score < 40:
        risk_level_code = max(risk_level_code, 2)
        risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
    elif final_score < 60:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 3. Vague Language
    if vague_ratio > 0.50 and concrete_ratio < 0.10:
        risk_level_code = 2
        risk_reasons.append("Excessive vague language")
    elif vague_ratio > 0.40 and concrete_ratio < 0.20:
        risk_level_code = max(risk_level_code, 1)  # At Risk

    # 4. Empty Claims: positive press with essentially no concrete data.
    if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
        risk_level_code = 2
        risk_reasons.append("Positive press without concrete data")

    # --- SAFE HARBOR OVERRIDE ---
    # Companies in inherently high-impact industries need exceptional
    # mitigation evidence before risk is waived.
    high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
    is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)

    pass_safe_harbor = False
    if concrete_ratio > 0.05 and len(contradictions) < 2:
        if is_high_risk:
            if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
                pass_safe_harbor = True
            else:
                if risk_level_code < 2:
                    risk_level_code = 2
                    risk_reasons.append("High Risk Industry without exceptional mitigation")
        elif emission_sentiment['label'] != 'Negative':
            pass_safe_harbor = True

    if pass_safe_harbor:
        risk_level_code = 0  # Force No Risk
        if risk_reasons:
            risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
        print(f"SAFE HARBOR TRIGGERED for {company_name}")

    # Map risk code to display label; the binary flag is 1 only for full
    # "Greenwashing" (UI warning banners), not for the intermediate "At Risk".
    if risk_level_code == 2:
        overall_risk_str = "Greenwashing"
        greenwashing_flag = 1
    elif risk_level_code == 1:
        overall_risk_str = "At Risk"
        greenwashing_flag = 0
    else:
        overall_risk_str = "No Risk"
        greenwashing_flag = 0

    # Surface risk reasons at the top of the major findings list.
    if risk_reasons and risk_level_code >= 1:
        pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']

    # --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
    company_description = ""
    ai_recommendations = {}

    if pplx_data:
        report_progress("Using insights...", 95)
        company_description = pplx_data.get("description", "Description unavailable.")
        ai_recommendations = pplx_data.get("recommendations", {})
    else:
        # Fallback to Gemini (module-level imports) or static defaults.
        try:
            report_progress("Generating insights...", 98)
            company_description = generate_company_description(company_name)

            pre_result = {
                "greenwashingLabel": greenwashing_flag,
                "internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
                "contradictions_detected": contradictions,
                "external_summary": {"public_sentiment": news_sentiment['label']}
            }
            ai_recommendations = generate_ai_recommendations(company_name, pre_result)
        except Exception as e:
            print(f"AI Generation fallback failed: {e}")
            company_description = f"Analysis of {company_name}'s sustainability practices."
            ai_recommendations = {
                "customers": ["Review sustainability claims"],
                "investors": ["Monitor ESG disclosures"],
                "regulators": ["Standard compliance checks"]
            }

    # --- COMPOSITE SENTIMENT SCORE (for reporting) ---
    # Deliberately recomputed AFTER the [RISK] prepend above, so flagged
    # reasons influence the reported internal sentiment (original behavior).
    internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])

    int_s = get_linear_score(internal_sentiment)
    ext_s = get_linear_score(news_sentiment)
    rev_s = get_linear_score(reviews_sentiment)

    composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
    composite_score_norm = composite_score / 100.0

    result = {
        "company_name": company_name,
        "company_description": company_description,
        "last_updated": datetime.now().isoformat(),
        "confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
        "greenwashingLabel": greenwashing_flag,  # 1 if Greenwashing, else 0 (for binary UIs)

        "detailed_scores": {
            "green_keyword_frequency": round(green_keyword_freq, 3),
            "vague_keyword_ratio": round(vague_ratio, 3),
            "concrete_claim_ratio": round(concrete_ratio, 3),
            "overall_sentiment": round(composite_score_norm, 3),
            "internal_sentiment": round(internal_sentiment['score'], 3),
            "external_sentiment": round(news_sentiment['score'], 3),
            "external_sentiment_gap": round(ext_gap, 3),
            "emission_sentiment": round(emission_sentiment['score'], 3),
            "energy_sentiment": round(energy_sentiment['score'], 3),
            "waste_sentiment": round(waste_sentiment['score'], 3),
            "relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
        },

        "external_summary": {
            "key_highlights": [
                f"Public Sentiment: {news_sentiment['label']}",
                f"Risk Level: {overall_risk_str}"
            ],
            "public_sentiment": news_sentiment['label'],
            "recent_news_summary": f"Analysis of {len(news_articles)} articles.",
            "possible_bias": "None",
            "evidence_links": news_articles[:5]
        },

        "internal_documents_analysis": {
            "major_findings": pdf_scores['env_sentences'][:5],
            "compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
            "performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
        },

        "risk_assessment": {
            "financial_risk": "High" if risk_level_code == 2 else "Low",
            "reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
            "compliance_risk": "High" if risk_level_code == 2 else "Low",
            "market_risk": "Medium" if final_score < 50 else "Low",
            # IMPACT: 3-State Output
            "overall_risk_level": overall_risk_str
        },

        "opportunities_and_strengths": [
            "Expand concrete data reporting",
            "Address external contradictions explicitly"
        ] if risk_level_code >= 1 else [
            "Strong concrete data transparency",
            "Positive external sentiment alignment"
        ],

        "reviews_analysis": {
            "sentiment_score": reviews_sentiment['score'],
            "total_reviews_analyzed": len(reviews),
            "review_sources": reviews[:5]
        },

        "recommended_actions": ai_recommendations,

        # BUGFIX: detect_hidden_patterns output was computed but discarded;
        # merge it with the heuristic vague-language flag.
        "hidden_patterns": hidden_patterns + ([
            {"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
        ] if vague_ratio > 0.4 else [])
    }

    report_progress(f"Analysis complete: Score {final_score}/100", 100)
    return result
app/services/hugchat_client.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from hugchat import hugchat
from hugchat.login import Login
from dotenv import load_dotenv

load_dotenv()

# HuggingChat credentials (set in .env); both are required to log in.
HF_EMAIL = os.getenv("HUGGINGFACE_EMAIL")
HF_PASS = os.getenv("HUGGINGFACE_PASS")

# Module-level cache so the login session is reused across calls.
_chatbot = None
13
+
14
def get_chatbot():
    """Return a cached, logged-in HuggingChat bot, or None if login fails."""
    global _chatbot

    # Reuse an existing session when available.
    if _chatbot:
        return _chatbot

    if not HF_EMAIL or not HF_PASS:
        print("Warning: HUGGINGFACE_EMAIL or HUGGINGFACE_PASS not found.")
        return None

    try:
        session = Login(HF_EMAIL, HF_PASS)
        cookie_jar = session.login()
        _chatbot = hugchat.ChatBot(cookies=cookie_jar.get_dict())
    except Exception as exc:
        print(f"HuggingChat Login Error: {exc}")
        return None
    return _chatbot
31
+
32
def generate_hugchat_response(prompt: str) -> str:
    """
    Generate text using HuggingChat.

    Returns the model's reply, or a short "AI unavailable" sentinel string
    when authentication is missing or the request fails.
    """
    chatbot = get_chatbot()
    if not chatbot:
        return "AI unavailable (Auth missing)."

    try:
        # Create a new conversation for isolation.
        # (Renamed from `id`, which shadowed the builtin.)
        conv_id = chatbot.new_conversation()
        chatbot.change_conversation(conv_id)

        response = chatbot.chat(prompt)
        text = response.wait_until_done()

        # Cleanup? (Optional, but good for privacy)
        # chatbot.delete_conversation(conv_id)

        return text
    except Exception as e:
        print(f"HuggingChat Error: {e}")
        return "AI unavailable (Error)."
app/services/llm_generator.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import google.generativeai as genai
import os
from dotenv import load_dotenv

load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Gemini is currently disabled by request; restore this block to re-enable.
# if GEMINI_API_KEY:
#     genai.configure(api_key=GEMINI_API_KEY)
#     model = genai.GenerativeModel('gemini-2.0-flash')
# else:
model = None  # All generators below return canned fallbacks while this is None.
print("Gemini LLM Disabled by user request.")
# print("Warning: GEMINI_API_KEY not found in .env. LLM features will be disabled.")
16
+
17
def generate_company_description(company_name: str) -> str:
    """
    Generate a brief, neutral 2-3 sentence company description via Gemini.

    Returns a sentinel string when the model is disabled or errors out.
    """
    if not model:
        return "AI description unavailable (API Key missing)."

    try:
        prompt = (
            f"Provide a factual, neutral 2-3 sentence description of the company "
            f"'{company_name}', focusing on its industry and main products. "
            f"Do not mention sentiment or controversies."
        )
        reply = model.generate_content(prompt)
        return reply.text.strip()
    except Exception as exc:
        print(f"Error generating description for {company_name}: {exc}")
        return "AI description unavailable due to an error."
31
+
32
def generate_ai_recommendations(company_name: str, analysis_data: dict) -> dict:
    """
    Generate tailored recommendations for Customers, Investors and Leadership.

    Returns a dict with keys "for_customers", "for_investors" and
    "for_company_leadership"; falls back to static tips when the LLM is
    disabled or the response cannot be parsed.
    """
    import json

    if not model:
        return {
            "for_customers": ["Review provided evidence links."],
            "for_investors": ["Analyze financial risks mentioned in report."],
            "for_company_leadership": ["Address flagged contradictions."]
        }

    try:
        # Summarised context keeps the prompt small and focused.
        context = f"""
        Company: {company_name}
        Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
        Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
        Contradictions: {len(analysis_data.get('contradictions_detected', []))} found.
        Sentiment: {analysis_data.get('external_summary', {}).get('public_sentiment', 'N/A')}
        """

        prompt = f"""
        Based on the following analysis of '{company_name}', provide 3 specific, actionable recommendations for each group (Customers, Investors, Leadership).
        Focus on greenwashing, transparency, and sustainability accountability.

        Analysis Context:
        {context}

        Output purely as JSON format with keys: "for_customers", "for_investors", "for_company_leadership". Each key should have a list of strings.
        Do not allow Markdown code blocks. Just raw JSON.
        """

        raw = model.generate_content(prompt).text.strip()

        # Strip accidental markdown fencing before parsing.
        if raw.startswith("```json"):
            raw = raw[7:]
        if raw.endswith("```"):
            raw = raw[:-3]

        return json.loads(raw)

    except Exception as exc:
        print(f"Error generating recommendations for {company_name}: {exc}")
        # Static fallback keeps the API response shape intact.
        return {
            "for_customers": ["Review provided evidence links.", "Cross-check claims."],
            "for_investors": ["Monitor reputational risks.", "Demand clearer impact reports."],
            "for_company_leadership": ["Address detected contradictions.", "Improve transparency."]
        }
84
+
85
def generate_combined_insights(company_name: str, analysis_data: dict) -> dict:
    """
    Combine description and recommendations into a single API call to reduce
    rate-limit usage.

    Returns: { "description": str, "recommendations": dict }
    """
    # BUGFIX: `json` was never imported in this module/function scope, so
    # json.loads raised NameError and the except branch returned the fallback
    # on every call, even when the model responded successfully.
    import json

    if not model:
        return {
            "description": "AI description unavailable (API Key missing).",
            "recommendations": generate_ai_recommendations(company_name, analysis_data)  # static fallback path
        }

    try:
        context = f"""
        Company: {company_name}
        Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
        Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
        """

        prompt = f"""
        Analyze '{company_name}' based on this context:
        {context}

        Provide 2 outputs in a single JSON object:
        1. "description": A factual 2-sentence description of the company.
        2. "recommendations": A dictionary with keys "for_customers", "for_investors", "for_company_leadership", containing 3 actionable tips for each.

        Output purely JSON. No markdown.
        """

        response = model.generate_content(prompt)
        text = response.text.strip()
        # Strip accidental markdown fencing before parsing.
        if text.startswith("```json"): text = text[7:]
        if text.endswith("```"): text = text[:-3]

        return json.loads(text)
    except Exception as e:
        print(f"Error generating combined insights for {company_name}: {e}")
        return {
            "description": "AI description unavailable due to high traffic.",
            "recommendations": {
                "for_customers": ["Review evidence links."],
                "for_investors": ["Analyze risks."],
                "for_company_leadership": ["Address contradictions."]
            }
        }
130
+
131
def generate_batch_insights(companies_data: list) -> dict:
    """
    Generate insights for a batch of companies (10-15 recommended) in a SINGLE prompt.

    Input: list of {name, context: str}
    Output: dict { company_name: { "description": ..., "recommendations": ... } }

    Uses HuggingChat while Gemini is disabled; returns a per-company static
    fallback when HuggingChat output cannot be parsed, and {} only when the
    Gemini backend itself errors.
    """
    import json
    from .hugchat_client import generate_hugchat_response

    def _fallback():
        # One canned insight object per company, keyed by exact name.
        return {c['name']: {
            "description": "AI unavailable (Key missing)",
            "recommendations": {
                "for_customers": ["Review evidence."],
                "for_investors": ["Check risks."],
                "for_company_leadership": ["Monitor compliance."]
            }
        } for c in companies_data}

    # Shared context block listing every company.
    batch_context = ""
    for i, c in enumerate(companies_data):
        batch_context += f"\n--- Company {i+1}: {c['name']} ---\n{c['context']}\n"

    # HuggingChat path (active while Gemini is disabled).
    if not model:
        prompt = f"""
        You are a sustainability analyst. Analyze these {len(companies_data)} companies.
        {batch_context}

        Return a valid JSON OBJECT where keys are company names.
        For each company, provide:
        1. "description": A factual 2-sentence summary.
        2. "recommendations": Object with keys "for_customers", "for_investors", "for_company_leadership" (list of 3 tips each).

        Example JSON Structure:
        {{
            "Company Name": {{
                "description": "...",
                "recommendations": {{ "for_customers": [...], ... }}
            }}
        }}

        IMPORTANT: Output ONLY valid JSON. No Markdown. No Intro.
        """

        print("Using HuggingChat for Batch Analysis...")
        response_text = generate_hugchat_response(prompt)

        try:
            # Clean markdown fencing before parsing.
            text = response_text.strip()
            if text.startswith("```json"): text = text[7:]
            if text.endswith("```"): text = text[:-3]
            if "{" not in text: raise Exception("Invalid JSON format")

            return json.loads(text)
        except Exception as e:
            print(f"HuggingChat Parsing Error: {e}")
            # BUGFIX: the old fall-through check (`not 'response_text' in
            # locals()`) was unreachable, so parse failures continued into the
            # Gemini path with model=None and returned {}. Return the
            # per-company fallback instead.
            return _fallback()

    # Gemini path (backup if re-enabled).
    try:
        prompt = f"""
        Analyze the following {len(companies_data)} companies based on the provided contexts.
        {batch_context}

        For EACH company, provide:
        1. "description": A factual 2-sentence summary.
        2. "recommendations": 3 specific actionable tips per group (Customers, Investors, Leadership).

        Output purely as a JSON OBJECT where keys are the exact company names and values are the insight objects.
        Example:
        {{
            "Company A": {{ "description": "...", "recommendations": {{ ... }} }},
            "Company B": ...
        }}

        No markdown formatting. Just JSON.
        """

        response = model.generate_content(prompt)
        text = response.text.strip()
        if text.startswith("```json"): text = text[7:]
        if text.endswith("```"): text = text[:-3]

        return json.loads(text)

    except Exception as e:
        print(f"Batch generation error: {e}")
        return {}
app/services/ml_logic.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .ml_models import ml_models
from .scoring import calculate_vague_score, calculate_concrete_score, analyze_sentiment
import re
import joblib
import os
import pandas as pd
import numpy as np

# Path configurations
# Resolved two directory levels above this file (…/ml_models); holds the
# joblib-pickled artifacts shipped with the deployment.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODEL_DIR = os.path.join(BASE_DIR, "ml_models")

# Pre-trained artifact locations (joblib pickles).
ENSEMBLE_PATH = os.path.join(MODEL_DIR, 'ensemble_model.pkl')
FEATURE_COLS_PATH = os.path.join(MODEL_DIR, 'all_feature_columns.pkl')
CAT_MAPPING_PATH = os.path.join(MODEL_DIR, 'category_to_greenwashing_mapping.pkl')
BINARY_MAPPING_PATH = os.path.join(MODEL_DIR, 'binary_to_report_name_mapping.pkl')

# Lazily populated caches (see load_artifacts); None until the first successful load.
_ensemble_model = None
_feature_cols = None
_binary_mapping = None
21
+
22
def load_artifacts():
    """Lazily load and cache the ensemble model, feature columns and label mapping.

    Returns:
        tuple: (model, feature_columns, binary_mapping). All three are None
        when the artifacts cannot be loaded (missing files, unpickling errors),
        in which case callers fall back to the heuristic scorer.
    """
    global _ensemble_model, _feature_cols, _binary_mapping

    # Identity checks rather than truthiness: an empty-but-loaded feature
    # column list must still count as "cached", otherwise every call would
    # hit the filesystem and re-unpickle the model.
    if _ensemble_model is not None and _feature_cols is not None:
        return _ensemble_model, _feature_cols, _binary_mapping

    try:
        if os.path.exists(ENSEMBLE_PATH):
            print(f"[ML] Loading Ensemble Model from {ENSEMBLE_PATH}...")
            _ensemble_model = joblib.load(ENSEMBLE_PATH)
            _feature_cols = joblib.load(FEATURE_COLS_PATH)

            if os.path.exists(BINARY_MAPPING_PATH):
                _binary_mapping = joblib.load(BINARY_MAPPING_PATH)
            else:
                # Fallback mapping if the pickled mapping file is missing.
                _binary_mapping = {0: 'Not Greenwashing (Low)', 1: 'Greenwashing (High/Medium)'}

            print(f"[ML] Ensemble Model Loaded. Features: {_feature_cols}")
            return _ensemble_model, _feature_cols, _binary_mapping
        else:
            # Make the silent fallback visible in logs instead of failing mutely.
            print(f"[ML] Ensemble model not found at {ENSEMBLE_PATH}; heuristic fallback will be used.")
    except Exception as e:
        print(f"[ML] Failed to load artifacts: {e}")

    return None, None, None
46
+
47
def train_model(data: list[dict]):
    """Compatibility stub: training is disabled in favour of the shipped ensemble.

    Kept so older callers importing ``train_model`` keep working; performs no
    training and always reports a 0.0 accuracy score.
    """
    print("[ML] Train requested, but system is now using pre-trained Ensemble Model.")
    return 0.0
54
+
55
def predict_greenwashing_risk(text, company_name="Unknown", features_dict=None):
    """
    Predict greenwashing risk using Ensemble Model if features are provided.
    Fallback to heuristic if only text is available.

    Args:
        text: Raw report/article text (used by the heuristic fallback path).
        company_name: Display name echoed back in the result payload.
        features_dict: Optional numeric features keyed by training column
            names; required for the ensemble path.

    Returns:
        dict with 'company_name', 'greenwashing_score' (0..1), 'risk_label',
        'model_label' and a 'details' sub-dict identifying which model ran.
    """
    model, features, binary_map = load_artifacts()

    # 1. Prediction using Ensemble Model (Feature-based)
    if model and features and features_dict:
        try:
            # Prepare input dataframe with correct column order
            input_data = {}
            for col in features:
                # Handle typo in specific user column "frequecy"
                val = features_dict.get(col)
                if val is None:
                    # Fallback for known variations of the training column names.
                    if col == 'Green Keyword frequecy':
                        val = features_dict.get('Green Keyword Frequency', 0)
                    elif col == 'Emission Sentiment ': # Note space
                        val = features_dict.get('Emission Sentiment', 0)
                    else:
                        val = 0
                input_data[col] = [float(val)]

            df = pd.DataFrame(input_data)

            # Predict
            pred_binary = model.predict(df)[0]
            pred_proba = model.predict_proba(df)[0] # [prob_0, prob_1]
            prob_gw = pred_proba[1]

            # granular mapping based on probability
            if prob_gw >= 0.75:
                risk_label = "High"
                label_text = "High Risk"  # NOTE(review): label_text is currently unused
            elif prob_gw >= 0.35:
                risk_label = "Medium"
                label_text = "Medium Risk"
            else:
                risk_label = "Low"
                label_text = "Low Risk"

            return {
                "company_name": company_name,
                "greenwashing_score": round(prob_gw, 3),
                "risk_label": risk_label,
                "model_label": risk_label, # Use simple label for UI mapping
                "details": {
                    "model_used": "Ensemble Voting Classifier",
                    "confidence": round(max(pred_proba) * 100, 1),
                    "features": features_dict # Return original features for UI
                }
            }

        except Exception as e:
            print(f"[ML] Ensemble prediction failed: {e}")
            # Fallback to heuristic below

    # 2. Heuristic Fallback (Text-based)
    # Vague marketing language raises risk; concrete metrics lower it;
    # negative sentiment adds a small penalty.
    sentences = re.split(r'(?<=[.!?]) +', text)
    vague_score = calculate_vague_score(sentences)
    concrete_score = calculate_concrete_score(sentences)
    sentiment = analyze_sentiment([text])

    risk_score = 0.5 + (vague_score * 0.4) - (concrete_score * 0.5)
    if sentiment['label'] == 'Negative':
        risk_score += sentiment['score'] * 0.2
    risk_score = max(0, min(1, risk_score))  # clamp to [0, 1]

    return {
        "company_name": company_name,
        "greenwashing_score": round(risk_score, 3),
        "risk_label": "High Risk" if risk_score > 0.7 else "Low Risk",
        "model_label": "Heuristic Analysis",
        "details": {
            "vague_language_ratio": round(vague_score, 3),
            "concrete_claims_ratio": round(concrete_score, 3),
            "model_used": "Heuristic Fallback"
        }
    }
136
+
137
+
app/services/ml_models.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch


class MLModels:
    """Process-wide singleton holding the expensive transformer models."""

    _instance = None

    def __new__(cls):
        # Fast path: models were already loaded once for this process.
        if cls._instance is not None:
            return cls._instance

        # Register the instance first (matches original behaviour), then load.
        cls._instance = instance = super(MLModels, cls).__new__(cls)
        instance.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Loading models on {instance.device}...")

        # Sentence embeddings used for semantic matching.
        instance.st_model = SentenceTransformer(
            'sentence-transformers/all-MiniLM-L6-v2', device=instance.device)

        # FinBERT for financial-tone sentiment classification.
        gpu_index = 0 if instance.device == 'cuda' else -1
        instance.finbert = pipeline(
            "text-classification", model="yiyanghkust/finbert-tone", device=gpu_index)

        # ClimateBERT for ESG sentiment intentionally left disabled (heavy).
        # instance.climatebert = pipeline("text-classification",
        #     model="climatebert/distilroberta-base-climate-sentiment", device=gpu_index)

        print("Models loaded successfully.")
        return cls._instance


# Shared, eagerly constructed singleton used across all services.
ml_models = MLModels()
app/services/pdf_processor.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import re
3
+
4
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the plain text of every page of *pdf_path*.

    Returns an empty string when the file cannot be opened or parsed.
    """
    pages = []
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
    return "".join(pages)
14
+
15
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences on terminal punctuation followed by whitespace."""
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')
    return sentence_boundary.split(text)
18
+
19
def clean_text(text: str) -> str:
    """Collapse whitespace runs into single spaces and strip the ends."""
    collapsed = re.sub(r"\s+", " ", str(text))
    return collapsed.strip()
app/services/perplexity_client.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import requests
import json
from dotenv import load_dotenv

# Pull environment variables from a local .env file (no-op when absent).
load_dotenv()

# Perplexity API credential; research_company() degrades gracefully when unset.
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
9
+
10
def research_company(company_name: str) -> dict:
    """
    Uses Perplexity AI to conduct deep web research on a company's environmental impact.

    Args:
        company_name: Name of the company to research.

    Returns:
        dict with keys 'description', 'findings', 'sentiment', 'recommendations'
        and 'citations' on success; None when the API key is missing or the
        request/parsing fails (callers treat None as "research unavailable").
    """
    if not PERPLEXITY_API_KEY:
        print("Warning: PERPLEXITY_API_KEY not found.")
        return None

    url = "https://api.perplexity.ai/chat/completions"

    # Prompt designed to extract structured data compatible with our existing analysis
    system_prompt = "You are an environmental analyst. Research the target company and return a JSON object with: 'description' (factual summaries), 'findings' (list of 5 key controversies or achievements), 'sentiment' (Positive/Negative/Mixed), 'citations' (list of source URLs), and 'recommendations' (object with keys 'for_customers', 'for_investors', 'for_company_leadership', each a list of 3 strings)."

    user_prompt = f"Research the environmental track record of '{company_name}'. Focus on emissions, greenwashing, and sustainability 2023-2025."

    payload = {
        "model": "sonar",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        # Low temperature: we want reproducible, fact-oriented JSON output.
        "temperature": 0.2
    }

    headers = {
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        "Content-Type": "application/json"
    }

    try:
        # Timeout added: without one, a stalled upstream call would block the
        # worker indefinitely.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()

        result = response.json()
        content = result['choices'][0]['message']['content']
        citations = result.get('citations', [])

        # Strip Markdown code fences the model sometimes wraps around the JSON.
        # strip() first so a trailing newline does not defeat endswith("```").
        content = content.strip()
        if content.startswith("```json"): content = content[7:]
        if content.endswith("```"): content = content[:-3]

        data = json.loads(content)
        data['citations'] = citations  # Ensure citations are attached
        return data

    except Exception as e:
        print(f"Perplexity API Error: {e}")
        return None
app/services/scoring.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
from sentence_transformers import util
from .ml_models import ml_models

# Reference phrases
# Each bank below anchors the semantic matcher (see semantic_matches): a
# sentence counts as a hit when its embedding is close to ANY phrase in a bank.

# General environmental topics.
ENV_REF = [
    "environment", "climate change", "carbon emissions", "pollution", "waste",
    "green energy", "renewable resources", "sustainability", "biodiversity",
    "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
]
# Broader ESG (environment + social + governance) topics.
ESG_REF = [
    "environment", "social responsibility", "governance", "sustainability", "carbon emissions",
    "green energy", "renewable resources", "waste management", "climate change", "pollution control",
    "biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
    "community development", "employee welfare", "diversity", "ethics"
]
# Concrete, already-taken actions.
ACTION_REF = [
    "implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
    "sustainability project", "steps taken to reduce carbon emissions",
    "initiatives to help the environment", "measures to prevent greenwashing"
]
# Forward-looking pledges and claims (the raw material of greenwashing).
CLAIM_REF = [
    "plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
    "intent to reduce", "objective to be", "aims for sustainability",
    "pledged to achieve", "will reduce carbon", "expect to reach net zero",
    "plans to be carbon neutral by", "commitment to net zero by",
    "goal to be eco friendly by", "target year for sustainability",
    "striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
]
30
+
31
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
    """Return the sentences whose best cosine similarity to any reference
    phrase meets *threshold*.

    Args:
        sentences: Iterable of sentence strings to test.
        reference: Reference phrase bank (list of strings).
        threshold: Minimum cosine similarity for a match.
        batch_size: Sentences are embedded in batches of this size to bound
            memory use on long documents.

    Returns:
        list[str]: matching sentences, stripped of surrounding whitespace.
    """
    model = ml_models.st_model
    ref_emb = model.encode(reference, convert_to_tensor=True)
    matches = []

    # range() with a positive step never yields an empty slice here, so no
    # per-batch emptiness check is needed.
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        sent_emb = model.encode(batch, convert_to_tensor=True)
        sim_matrix = util.cos_sim(sent_emb, ref_emb)  # shape: (batch, refs)

        for j, sim_scores in enumerate(sim_matrix):
            # A sentence matches if it is close to ANY reference phrase.
            if sim_scores.max().item() >= threshold:
                matches.append(batch[j].strip())

    # `matches` is already a list, so no empty-list special case is required.
    return matches
48
+
49
def calculate_scores(sentences):
    """Semantic-match *sentences* against the reference banks and return counts
    (plus the matched env/action sentences for display)."""
    matched = {
        "env": semantic_matches(sentences, ENV_REF),
        "esg": semantic_matches(sentences, ESG_REF),
        "action": semantic_matches(sentences, ACTION_REF),
        # Claims use a slightly looser threshold to catch softer pledge wording.
        "claim": semantic_matches(sentences, CLAIM_REF, threshold=0.54),
    }

    return {
        "env_count": len(matched["env"]),
        "esg_count": len(matched["esg"]),
        "action_count": len(matched["action"]),
        "claim_count": len(matched["claim"]),
        "env_sentences": matched["env"],
        "action_sentences": matched["action"],
    }
63
+
64
def calculate_vague_score(sentences):
    """
    Calculate the ratio of sentences containing vague/future-tense language.
    Returns 0.0 for an empty input list.
    """
    vague_regex = re.compile(
        r"aim(s|ing)? to|plan(s|ning)? to|committed to|strive(s|ing)? for|"
        r"intend(s|ing)? to|goal of|vision|hopefully|aspire(s|ing)? to|"
        r"future|potential|believe",
        re.IGNORECASE,
    )

    hits = sum(1 for sent in sentences if vague_regex.search(sent))
    # max(..., 1) guards the empty-input division.
    return hits / max(len(sentences), 1)
81
+
82
def calculate_concrete_score(sentences):
    """
    Calculate the ratio of sentences backed by specific, concrete metrics:
    percentages, money, tonnage, target years, or completed-action verbs.
    """
    concrete_patterns = (
        r"\d+(\.\d+)?%",                          # Percentages
        r"\$\d+",                                 # Money
        r"\d+ (tons|kg|metric tons|tonnes)",      # Weight
        r"by 20\d{2}",                            # Years (e.g. by 2030)
        r"reduced by", r"achieved", r"completed"  # Past tense concrete verbs
    )
    concrete_regex = re.compile("|".join(concrete_patterns), re.IGNORECASE)

    hits = sum(1 for sent in sentences if concrete_regex.search(sent))
    # max(..., 1) guards the empty-input division.
    return hits / max(len(sentences), 1)
102
+
103
def analyze_sentiment(text_chunks):
    """Aggregate FinBERT sentiment over *text_chunks* by majority label.

    Returns {"label": ..., "score": fraction of chunks carrying that label};
    Neutral/0.5 when nothing could be scored.
    """
    predictions = []
    for chunk in text_chunks:
        # Truncate to 1500 chars (roughly 300-400 tokens) to stay well within
        # the model's input limit.
        snippet = chunk[:1500] if len(chunk) > 1500 else chunk
        try:
            result = ml_models.finbert(snippet, truncation=True, max_length=512)
            predictions.append(result[0])  # e.g. {'label': 'Positive', 'score': 0.9}
        except Exception as e:
            print(f"Sentiment error: {e}")

    if not predictions:
        return {"label": "Neutral", "score": 0.5}

    total = len(predictions)
    tally = {
        name: sum(1 for r in predictions if r['label'] == name)
        for name in ("Positive", "Negative", "Neutral")
    }

    # Majority vote; ties fall through to Neutral.
    if tally["Positive"] > tally["Negative"] and tally["Positive"] > tally["Neutral"]:
        return {"label": "Positive", "score": tally["Positive"] / total}
    if tally["Negative"] > tally["Positive"] and tally["Negative"] > tally["Neutral"]:
        return {"label": "Negative", "score": tally["Negative"] / total}
    return {"label": "Neutral", "score": tally["Neutral"] / total}
126
+
127
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
    """
    Run sentiment analysis restricted to chunks mentioning any aspect keyword.
    Returns Neutral/0.5 when no chunk mentions the aspect at all.
    """
    relevant = [
        chunk for chunk in text_chunks
        if any(keyword in chunk.lower() for keyword in aspect_keywords)
    ]

    if not relevant:
        return {"label": "Neutral", "score": 0.5}

    return analyze_sentiment(relevant)
app/services/scraper.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
import requests
import logging
from fake_useragent import UserAgent
# `ddgs` is the renamed successor of `duckduckgo_search`; support both.
try:
    from ddgs import DDGS
except ImportError:
    from duckduckgo_search import DDGS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Rotating User-Agent generator used to disguise scraper requests.
ua = UserAgent()

# Progress tracking
# Module-level hook set via set_progress_callback(); None means "no listener".
progress_callback = None
25
+
26
def set_progress_callback(callback):
    """Register *callback(message, percentage)* to receive progress updates.

    Pass None to detach the current listener.
    """
    global progress_callback
    progress_callback = callback
30
+
31
def report_progress(message, percentage):
    """Forward a progress update to the registered callback (if any)."""
    if progress_callback:
        progress_callback(message, percentage)
    # Always echo locally so progress also shows up in container logs.
    print(f"[{percentage}%] {message}")
36
+
37
def setup_selenium_driver():
    """Setup a stealth Selenium driver with HuggingFace/Docker compatibility.

    Probes the environment and tries, in order: a system chromedriver with
    system Chromium (Docker/HF), a webdriver_manager-installed driver, and
    finally Selenium's default Chrome resolution. Raises only when every
    strategy fails inside Docker. Caller is responsible for driver.quit().
    """
    options = Options()
    options.add_argument("--headless=new") # New headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-infobars")
    options.add_argument("--window-size=1920,1080")
    # Random UA per driver instance to reduce fingerprinting.
    options.add_argument(f"user-agent={ua.random}")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # Check if running in Docker/HuggingFace environment
    is_docker = os.path.exists("/.dockerenv") or os.environ.get("HF_SPACE_ID")

    driver = None

    if is_docker:
        logger.info("Running in Docker/HuggingFace environment, using system Chromium")
        # Use system Chromium in Docker
        chromium_paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
        chromedriver_paths = ["/usr/bin/chromedriver", "/usr/local/bin/chromedriver"]

        # Point the browser binary at the first Chromium install we find.
        for chromium_path in chromium_paths:
            if os.path.exists(chromium_path):
                options.binary_location = chromium_path
                logger.info(f"Using Chromium at: {chromium_path}")
                break

        try:
            # Try with system chromedriver first
            for chromedriver_path in chromedriver_paths:
                if os.path.exists(chromedriver_path):
                    service = Service(chromedriver_path)
                    driver = webdriver.Chrome(service=service, options=options)
                    logger.info(f"Using chromedriver at: {chromedriver_path}")
                    break

            if driver is None:
                # Fallback to webdriver_manager (downloads a matching driver).
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Docker Chrome setup failed: {e}")
            # Final fallback - try default Chrome
            try:
                driver = webdriver.Chrome(options=options)
            except Exception as e2:
                logger.error(f"All Chrome drivers failed: {e2}")
                raise
    else:
        # Local development - use webdriver_manager
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=options)
        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver with manager: {e}")
            driver = webdriver.Chrome(options=options)

    # Apply stealth settings to mask common headless-automation signals.
    stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
    )

    return driver
109
+
110
async def scrape_url_selenium(url):
    """Fetch *url* with a stealth Selenium session.

    Returns a (clean_text, raw_html) pair, or ("", "") on any failure so
    callers can skip the page gracefully.
    """
    logger.info(f"Scraping with Selenium: {url}")
    try:
        def _selenium_task():
            driver = setup_selenium_driver()
            try:
                driver.get(url)
                # Crude settle time for JS-rendered pages; WebDriverWait would
                # be more precise but this is good enough for article content.
                import time
                time.sleep(3)
                return driver.page_source
            finally:
                driver.quit()

        raw_html = await asyncio.to_thread(_selenium_task)

        # Strip scripts/styles and flatten the DOM to visible text.
        soup = BeautifulSoup(raw_html, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.decompose()
        visible_text = soup.get_text(separator=' ', strip=True)
        return visible_text, raw_html

    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return "", ""
139
+
140
async def search_web(query, max_results=5):
    """
    Search the web using DuckDuckGo (no API key required).

    Returns a list of {"title", "url", "content", "query_type"} dicts;
    an empty list on any search failure.
    """
    try:
        # DDGS().text() is a synchronous generator, so run it off the event loop.
        def run_search():
            with DDGS() as ddgs:
                return list(ddgs.text(query, max_results=max_results))

        raw_hits = await asyncio.to_thread(run_search)

        # Normalise DDG's title/href/body schema to our internal result shape.
        return [
            {
                "title": hit.get('title', ''),
                "url": hit.get('href', ''),
                "content": hit.get('body', ''),
                "query_type": "web_search",
            }
            for hit in raw_hits
        ]
    except Exception as e:
        print(f"Search error for '{query}': {e}")
        return []
168
+
169
async def get_news_from_api(company_name):
    """
    Fetch ESG-related news about *company_name* via NewsAPI.

    Returns a list of {'url', 'title', 'content', 'query_type'} dicts;
    [] when the NEWS_API_KEY env var is unset or the request/parse fails.
    """
    import re  # local import: this module does not import re at top level

    api_key = os.getenv('NEWS_API_KEY')
    if not api_key:
        return []

    try:
        url = "https://newsapi.org/v2/everything"
        params = {
            'q': f'{company_name} AND (sustainability OR greenwashing OR ESG OR environmental)',
            'language': 'en',
            'sortBy': 'relevancy',
            'pageSize': 15,
            'apiKey': api_key
        }

        # requests is blocking, so run it in a worker thread to keep the loop free.
        response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
        data = response.json()

        if data.get('status') != 'ok':
            return []

        # NEGATIVE FILTER: exclude crime/fraud coverage. Matched on word
        # boundaries -- a plain substring test would let short tokens such as
        # "ed" (Enforcement Directorate) reject titles containing "reduced"
        # or "pledged".
        bad_keywords = ["fraud", "arrest", "scam", "police", "laundering",
                        "jail", "cbi", "ed", "bribe", "punish", "litigation"]
        bad_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, bad_keywords)) + r")\b")

        articles = []
        for article in data.get('articles', []):
            # NewsAPI redacts some articles; skip those placeholders.
            if article.get('title') == '[Removed]':
                continue

            title_lower = (article.get('title') or "").lower()
            if bad_regex.search(title_lower):
                continue

            # POSITIVE ESG filtering is delegated to the NewsAPI query itself
            # ("AND (sustainability OR ...)"), mirroring the web-search path.

            articles.append({
                'url': article.get('url', ''),
                'title': article.get('title', ''),
                'content': (article.get('description') or '') + ' ' + (article.get('content') or ''),
                'query_type': 'news_api'
            })
        return articles
    except Exception as e:
        print(f"NewsAPI error: {e}")

    return []
223
+
224
+ # Helper for Filtering
225
def is_valid_result(res):
    """Filter out navigational, login, and otherwise useless search hits."""
    url = res.get('url', '').lower()
    title = res.get('title', '').lower()

    # Generic/navigational domains that never carry article content.
    blocked_domains = ('google.com/search', 'google.com/url', 'accounts.google.com',
                       'support.google.com', 'youtube.com', 'facebook.com',
                       'twitter.com/login', 'linkedin.com/login')

    # Titles indicating auth walls, bot checks, or downloads.
    blocked_terms = ('sign in', 'log in', 'forgot password', 'download',
                     'captcha', 'security check', 'robot', 'access denied')

    if any(domain in url for domain in blocked_domains):
        return False
    if any(term in title for term in blocked_terms):
        return False

    # Minimum-content-length gating intentionally disabled for now.
    return True
245
+
246
async def get_company_news(company_name):
    """Collect ESG-related news for *company_name* from NewsAPI plus DuckDuckGo.

    Results are keyword-filtered (must carry ESG/climate context, must not be
    crime/fraud coverage), de-duplicated by URL, and capped at 20 articles.
    """
    import re  # local import: this module does not import re at top level

    report_progress(f"Starting news collection for {company_name}", 10)

    articles = []
    # 1. NewsAPI first (most reliable when a key is configured).
    report_progress("Checking NewsAPI...", 15)
    api_articles = await get_news_from_api(company_name)
    articles.extend(api_articles)

    # 2. DuckDuckGo web search for deeper coverage.
    report_progress("Fetching additional news via Web Search...", 25)

    queries = [
        f'"{company_name}" environmental impact report news',
        f'"{company_name}" greenwashing controversy scandal',
        f'"{company_name}" sustainability goals criticism',
        f'"{company_name}" ESG rating news detected',
        f'"{company_name}" climate change commitments review'
    ]

    # ESG/Climate keywords (refined to avoid generic matches).
    # "green" and "environment" are excluded deliberately: they match
    # "green light", "business environment", etc.
    ESG_KEYWORDS = [
        "climate", "carbon", "emission", "pollution", "sustainability", "esg",
        "renewable", "net zero", "biodiversity", "ecological", "greenhouse", "fossil fuel"
    ]

    # Negative keywords exclude financial-crime/generic news. Matched on word
    # boundaries: a substring test would let "ed" reject "pledged"/"reduced".
    NEGATIVE_KEYWORDS = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe"]
    negative_regex = re.compile(r"\b(?:" + "|".join(map(re.escape, NEGATIVE_KEYWORDS)) + r")\b")

    for query in queries:
        if len(articles) >= 20:
            break

        results = await search_web(query, max_results=5)
        for res in results:
            if not is_valid_result(res):
                continue

            # search_web() normalises the snippet under 'content' (not 'body'),
            # so inspect title + content together for context.
            text_to_check = (res.get('title', '') + " " + res.get('content', '')).lower()
            title_lower = res.get('title', '').lower()

            # 1. NEGATIVE FILTER: drop crime/fraud stories immediately.
            if negative_regex.search(title_lower):
                continue

            # 2. POSITIVE FILTER: require ESG context. "environmental" is
            # re-admitted explicitly (the bare word "environment" was deemed
            # too generic for the keyword list above).
            if "environmental" in text_to_check:
                pass
            elif not any(k in text_to_check for k in ESG_KEYWORDS):
                continue  # Skip if no environmental context found

            # Simple de-duplication by URL.
            if not any(a['url'] == res['url'] for a in articles):
                articles.append(res)

    report_progress(f"News collection complete: {len(articles)} articles", 45)
    return articles[:20]
304
+
305
async def get_company_reviews(company_name):
    """Collect employee/customer reviews with environmental context for *company_name*.

    Searches review-heavy sites (Glassdoor, Reddit, Trustpilot) plus broad
    fallback queries, keeps only hits that actually mention the company, and
    caps the result at 40 entries. Returns a list of
    {'url', 'title', 'content', 'source_type'} dicts. No mock-data fallback.
    """
    report_progress(f"Starting review collection for {company_name}", 50)

    reviews = []
    c_name_lower = company_name.lower()
    name_tokens = c_name_lower.split()

    def _mentions_company(res):
        """True when the hit's title/snippet plausibly refers to the company."""
        res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
        if c_name_lower in res_content:
            return True
        # Fall back to the first word (e.g. "Google" from "Google Inc"), but
        # only when it is long enough not to be a generic word like "The".
        if name_tokens and len(name_tokens[0]) > 3:
            return name_tokens[0] in res_content
        return False

    # site: operators force specific review sources; the last entries are
    # broad fallbacks in case the targeted queries come up empty.
    queries = [
        f'site:glassdoor.com "{company_name}" reviews "environment" OR "sustainability"',
        f'site:reddit.com "{company_name}" greenwashing OR "toxic"',
        f'site:trustpilot.com "{company_name}" environment',
        f'"{company_name}" employee reviews sustainability ethics',
        f'"{company_name}" environmental controversy reviews',
        f'"{company_name}" corporate responsibility feedback'
    ]

    total_queries = len(queries)
    for idx, query in enumerate(queries):
        progress = 50 + (idx / total_queries) * 30
        report_progress(f"Searching specific reviews: {query}", int(progress))

        results = await search_web(query, max_results=8)

        for res in results:
            if len(reviews) >= 40:
                break
            if not is_valid_result(res):
                continue
            if not _mentions_company(res):
                print(f"Skipping unrelated result: {res['title']}")
                continue

            # Tag the source from the URL so the UI can badge it.
            source = "web"
            if "glassdoor" in res['url']: source = "Glassdoor"
            elif "twitter" in res['url'] or "x.com" in res['url']: source = "Twitter"
            elif "linkedin" in res['url']: source = "LinkedIn"
            elif "reddit" in res['url']: source = "Reddit"
            elif "trustpilot" in res['url']: source = "Trustpilot"

            # Clean site suffixes out of the title.
            title = res['title'].replace(" | Glassdoor", "").replace(" | Reddit", "")

            reviews.append({
                "url": res['url'],
                "title": title,
                "content": res['content'],  # the snippet doubles as the review text
                "source_type": source
            })

        # Gentle pacing between queries to avoid search-engine throttling.
        await asyncio.sleep(1)

    # Broader fallback when the targeted queries found almost nothing.
    if len(reviews) < 3:
        report_progress("Few reviews found, trying specific broader query...", 75)
        fallback_results = await search_web(f'"{company_name}" reviews environment', max_results=5)
        for res in fallback_results:
            if not is_valid_result(res):
                continue
            if any(r['url'] == res['url'] for r in reviews):
                continue
            if not _mentions_company(res):
                continue

            reviews.append({
                "url": res['url'],
                "title": res['title'],
                "content": res['content'],
                "source_type": "Web Search"
            })

    report_progress(f"Review collection complete: {len(reviews)} reviews", 80)
    return reviews
binary_to_report_name_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
3
+ size 74
category_to_greenwashing_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
3
+ size 44
ensemble_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
3
+ size 246560
ml_models/all_feature_columns.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
3
+ size 219
ml_models/binary_to_report_name_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
3
+ size 74
ml_models/category_to_greenwashing_mapping.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
3
+ size 44
ml_models/ensemble_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
3
+ size 246560