Spaces:
Sleeping
Sleeping
Deploy GreenIntellect Backend API with ML models and scraping
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +26 -0
- .gitattributes +2 -33
- Dockerfile +72 -0
- README.md +48 -5
- all_feature_columns.pkl +3 -0
- app/__init__.py +0 -0
- app/__pycache__/__init__.cpython-310.pyc +0 -0
- app/__pycache__/__init__.cpython-311.pyc +0 -0
- app/__pycache__/main.cpython-310.pyc +0 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/api/__pycache__/endpoints.cpython-310.pyc +0 -0
- app/api/__pycache__/endpoints.cpython-311.pyc +0 -0
- app/api/endpoints.py +477 -0
- app/db/__pycache__/models.cpython-310.pyc +0 -0
- app/db/__pycache__/models.cpython-311.pyc +0 -0
- app/db/__pycache__/session.cpython-310.pyc +0 -0
- app/db/__pycache__/session.cpython-311.pyc +0 -0
- app/db/models.py +37 -0
- app/db/session.py +20 -0
- app/main.py +33 -0
- app/services/__pycache__/analysis_engine.cpython-310.pyc +0 -0
- app/services/__pycache__/analysis_engine.cpython-311.pyc +0 -0
- app/services/__pycache__/hugchat_client.cpython-311.pyc +0 -0
- app/services/__pycache__/llm_generator.cpython-311.pyc +0 -0
- app/services/__pycache__/ml_logic.cpython-311.pyc +0 -0
- app/services/__pycache__/ml_models.cpython-310.pyc +0 -0
- app/services/__pycache__/ml_models.cpython-311.pyc +0 -0
- app/services/__pycache__/pdf_processor.cpython-310.pyc +0 -0
- app/services/__pycache__/pdf_processor.cpython-311.pyc +0 -0
- app/services/__pycache__/perplexity_client.cpython-311.pyc +0 -0
- app/services/__pycache__/scoring.cpython-310.pyc +0 -0
- app/services/__pycache__/scoring.cpython-311.pyc +0 -0
- app/services/__pycache__/scraper.cpython-310.pyc +0 -0
- app/services/__pycache__/scraper.cpython-311.pyc +0 -0
- app/services/analysis_engine.py +425 -0
- app/services/hugchat_client.py +54 -0
- app/services/llm_generator.py +229 -0
- app/services/ml_logic.py +137 -0
- app/services/ml_models.py +26 -0
- app/services/pdf_processor.py +21 -0
- app/services/perplexity_client.py +58 -0
- app/services/scoring.py +139 -0
- app/services/scraper.py +393 -0
- binary_to_report_name_mapping.pkl +3 -0
- category_to_greenwashing_mapping.pkl +3 -0
- ensemble_model.pkl +3 -0
- ml_models/all_feature_columns.pkl +3 -0
- ml_models/binary_to_report_name_mapping.pkl +3 -0
- ml_models/category_to_greenwashing_mapping.pkl +3 -0
- ml_models/ensemble_model.pkl +3 -0
.dockerignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
venv/
|
| 6 |
+
.venv/
|
| 7 |
+
|
| 8 |
+
# Database (created at runtime)
|
| 9 |
+
*.db
|
| 10 |
+
|
| 11 |
+
# Uploads (created at runtime)
|
| 12 |
+
uploads/
|
| 13 |
+
|
| 14 |
+
# Environment files
|
| 15 |
+
.env
|
| 16 |
+
.env.local
|
| 17 |
+
|
| 18 |
+
# IDE
|
| 19 |
+
.vscode/
|
| 20 |
+
.idea/
|
| 21 |
+
|
| 22 |
+
# Logs
|
| 23 |
+
*.log
|
| 24 |
+
|
| 25 |
+
# Git
|
| 26 |
+
.git/
|
.gitattributes
CHANGED
|
@@ -1,35 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Git LFS for large model files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces - GreenIntellect Backend API
|
| 2 |
+
# Python FastAPI + ML Models + Scraping
|
| 3 |
+
|
| 4 |
+
FROM python:3.11-slim
|
| 5 |
+
|
| 6 |
+
# Create non-root user (required by Hugging Face)
|
| 7 |
+
RUN useradd -m -u 1000 user
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
# Install system dependencies for Selenium/Chromium
|
| 11 |
+
RUN apt-get update && apt-get install -y \
|
| 12 |
+
curl \
|
| 13 |
+
wget \
|
| 14 |
+
gnupg \
|
| 15 |
+
chromium \
|
| 16 |
+
chromium-driver \
|
| 17 |
+
fonts-liberation \
|
| 18 |
+
libasound2 \
|
| 19 |
+
libatk-bridge2.0-0 \
|
| 20 |
+
libatk1.0-0 \
|
| 21 |
+
libatspi2.0-0 \
|
| 22 |
+
libcups2 \
|
| 23 |
+
libdbus-1-3 \
|
| 24 |
+
libdrm2 \
|
| 25 |
+
libgbm1 \
|
| 26 |
+
libgtk-3-0 \
|
| 27 |
+
libnspr4 \
|
| 28 |
+
libnss3 \
|
| 29 |
+
libwayland-client0 \
|
| 30 |
+
libxcomposite1 \
|
| 31 |
+
libxdamage1 \
|
| 32 |
+
libxfixes3 \
|
| 33 |
+
libxkbcommon0 \
|
| 34 |
+
libxrandr2 \
|
| 35 |
+
xdg-utils \
|
| 36 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 37 |
+
|
| 38 |
+
# Set Chrome environment variables
|
| 39 |
+
ENV CHROME_BIN=/usr/bin/chromium
|
| 40 |
+
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
|
| 41 |
+
|
| 42 |
+
# Copy and install Python dependencies
|
| 43 |
+
COPY requirements.txt /app/requirements.txt
|
| 44 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 45 |
+
pip install --no-cache-dir --timeout=300 -r /app/requirements.txt
|
| 46 |
+
|
| 47 |
+
# Copy ML model files
|
| 48 |
+
COPY ensemble_model.pkl /app/ensemble_model.pkl
|
| 49 |
+
COPY all_feature_columns.pkl /app/all_feature_columns.pkl
|
| 50 |
+
COPY binary_to_report_name_mapping.pkl /app/binary_to_report_name_mapping.pkl
|
| 51 |
+
COPY category_to_greenwashing_mapping.pkl /app/category_to_greenwashing_mapping.pkl
|
| 52 |
+
|
| 53 |
+
# Copy backend application
|
| 54 |
+
COPY app /app/app
|
| 55 |
+
COPY ml_models /app/ml_models
|
| 56 |
+
|
| 57 |
+
# Create directories
|
| 58 |
+
RUN mkdir -p /app/uploads && chown -R user:user /app
|
| 59 |
+
|
| 60 |
+
# Switch to non-root user
|
| 61 |
+
USER user
|
| 62 |
+
|
| 63 |
+
# Environment variables
|
| 64 |
+
ENV PORT=7860
|
| 65 |
+
ENV HOST=0.0.0.0
|
| 66 |
+
ENV PYTHONUNBUFFERED=1
|
| 67 |
+
ENV PYTHONPATH=/app
|
| 68 |
+
|
| 69 |
+
EXPOSE 7860
|
| 70 |
+
|
| 71 |
+
# Start FastAPI
|
| 72 |
+
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,11 +1,54 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: GreenIntellect API
|
| 3 |
+
emoji: 🌿
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# 🌿 GreenIntellect API
|
| 12 |
+
|
| 13 |
+
AI-powered API for analyzing sustainability reports and detecting greenwashing.
|
| 14 |
+
|
| 15 |
+
## API Endpoints
|
| 16 |
+
|
| 17 |
+
| Endpoint | Method | Description |
|
| 18 |
+
|----------|--------|-------------|
|
| 19 |
+
| `/api/` | GET | API health check |
|
| 20 |
+
| `/api/analyze` | POST | Analyze text for greenwashing |
|
| 21 |
+
| `/api/upload` | POST | Upload PDF for analysis |
|
| 22 |
+
| `/api/requests` | GET | Get analysis requests |
|
| 23 |
+
| `/` | GET | API welcome message |
|
| 24 |
+
|
| 25 |
+
## Usage
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
import requests
|
| 29 |
+
|
| 30 |
+
# Analyze text
|
| 31 |
+
response = requests.post(
|
| 32 |
+
"https://tanxshh-greenintellect.hf.space/api/analyze",
|
| 33 |
+
json={"company_name": "Example Corp", "text": "Our sustainable practices..."}
|
| 34 |
+
)
|
| 35 |
+
print(response.json())
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Features
|
| 39 |
+
|
| 40 |
+
- 📄 PDF/Text Analysis
|
| 41 |
+
- 🔍 Greenwashing Detection
|
| 42 |
+
- 📊 Sentiment Analysis
|
| 43 |
+
- 🌐 Web Scraping (News & Reviews)
|
| 44 |
+
- 🤖 AI-powered Insights
|
| 45 |
+
|
| 46 |
+
## Technology
|
| 47 |
+
|
| 48 |
+
- FastAPI + Python 3.11
|
| 49 |
+
- FinBERT & Sentence Transformers
|
| 50 |
+
- Selenium + Chromium for scraping
|
| 51 |
+
- SQLite Database
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
Built with ❤️ for a sustainable future
|
all_feature_columns.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
|
| 3 |
+
size 219
|
app/__init__.py
ADDED
|
File without changes
|
app/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (113 Bytes). View file
|
|
|
app/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (179 Bytes). View file
|
|
|
app/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (952 Bytes). View file
|
|
|
app/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
app/api/__pycache__/endpoints.cpython-310.pyc
ADDED
|
Binary file (4.17 kB). View file
|
|
|
app/api/__pycache__/endpoints.cpython-311.pyc
ADDED
|
Binary file (25.7 kB). View file
|
|
|
app/api/endpoints.py
ADDED
|
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
|
| 2 |
+
from sqlalchemy.orm import Session
|
| 3 |
+
from typing import List
|
| 4 |
+
import shutil
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import csv
|
| 9 |
+
import io
|
| 10 |
+
import time
|
| 11 |
+
import random
|
| 12 |
+
from ..db.session import get_db
|
| 13 |
+
from ..db.models import Company, AnalysisRequest
|
| 14 |
+
from ..services.analysis_engine import analyze_company
|
| 15 |
+
from ..services.ml_logic import predict_greenwashing_risk
|
| 16 |
+
|
| 17 |
+
router = APIRouter()
|
| 18 |
+
|
| 19 |
+
UPLOAD_DIR = "uploads"
|
| 20 |
+
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
| 21 |
+
|
| 22 |
+
@router.post("/requests")
|
| 23 |
+
async def create_request(
|
| 24 |
+
company_name: str = Form(...),
|
| 25 |
+
file: UploadFile = File(...),
|
| 26 |
+
db: Session = Depends(get_db)
|
| 27 |
+
):
|
| 28 |
+
# Save file
|
| 29 |
+
file_path = os.path.join(UPLOAD_DIR, file.filename)
|
| 30 |
+
with open(file_path, "wb") as buffer:
|
| 31 |
+
shutil.copyfileobj(file.file, buffer)
|
| 32 |
+
|
| 33 |
+
# Create Request Record (Pending)
|
| 34 |
+
db_request = AnalysisRequest(
|
| 35 |
+
user_id="demo-user", # Replace with auth
|
| 36 |
+
company_name=company_name,
|
| 37 |
+
document_name=file.filename,
|
| 38 |
+
document_content=file_path, # Store path temporarily or extract text later
|
| 39 |
+
status="pending"
|
| 40 |
+
)
|
| 41 |
+
db.add(db_request)
|
| 42 |
+
db.commit()
|
| 43 |
+
db.refresh(db_request)
|
| 44 |
+
|
| 45 |
+
return db_request
|
| 46 |
+
|
| 47 |
+
@router.post("/requests/{id}/approve")
|
| 48 |
+
async def approve_request(id: int, db: Session = Depends(get_db)):
|
| 49 |
+
db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
|
| 50 |
+
if not db_request:
|
| 51 |
+
raise HTTPException(status_code=404, detail="Request not found")
|
| 52 |
+
|
| 53 |
+
if db_request.status != "pending":
|
| 54 |
+
raise HTTPException(status_code=400, detail="Request already processed")
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
# Update status
|
| 58 |
+
db_request.status = "processing"
|
| 59 |
+
db.commit()
|
| 60 |
+
|
| 61 |
+
# Run Analysis
|
| 62 |
+
# Note: document_content currently holds the file path from create_request
|
| 63 |
+
file_path = db_request.document_content
|
| 64 |
+
result = await analyze_company(db_request.company_name, file_path)
|
| 65 |
+
|
| 66 |
+
# Update Request
|
| 67 |
+
db_request.status = "completed"
|
| 68 |
+
db_request.analysis_result = result
|
| 69 |
+
|
| 70 |
+
# Update or Create Company Record
|
| 71 |
+
company = db.query(Company).filter(Company.name == db_request.company_name).first()
|
| 72 |
+
if not company:
|
| 73 |
+
company = Company(name=db_request.company_name)
|
| 74 |
+
db.add(company)
|
| 75 |
+
|
| 76 |
+
company.analysis_result = result
|
| 77 |
+
company.last_analysis_date = datetime.now()
|
| 78 |
+
|
| 79 |
+
db.commit()
|
| 80 |
+
|
| 81 |
+
return result
|
| 82 |
+
|
| 83 |
+
except Exception as e:
|
| 84 |
+
db_request.status = "failed"
|
| 85 |
+
db_request.rejection_reason = str(e)
|
| 86 |
+
db.commit()
|
| 87 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 88 |
+
|
| 89 |
+
@router.post("/requests/{id}/reject")
|
| 90 |
+
def reject_request(id: int, reason: str = Form(...), db: Session = Depends(get_db)):
|
| 91 |
+
db_request = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
|
| 92 |
+
if not db_request:
|
| 93 |
+
raise HTTPException(status_code=404, detail="Request not found")
|
| 94 |
+
|
| 95 |
+
# Delete the request
|
| 96 |
+
db.delete(db_request)
|
| 97 |
+
db.commit()
|
| 98 |
+
return {"message": f"Request for {db_request.company_name} rejected and deleted", "reason": reason}
|
| 99 |
+
|
| 100 |
+
@router.get("/requests")
|
| 101 |
+
def get_requests(db: Session = Depends(get_db)):
|
| 102 |
+
return db.query(AnalysisRequest).all()
|
| 103 |
+
|
| 104 |
+
@router.get("/companies")
|
| 105 |
+
def get_companies(db: Session = Depends(get_db)):
|
| 106 |
+
return db.query(Company).all()
|
| 107 |
+
|
| 108 |
+
@router.post("/companies/bulk")
|
| 109 |
+
def bulk_import_companies(companies: List[dict], db: Session = Depends(get_db)):
|
| 110 |
+
"""Bulk import companies from CSV or other sources"""
|
| 111 |
+
imported = []
|
| 112 |
+
for company_data in companies:
|
| 113 |
+
# Check if company already exists
|
| 114 |
+
existing = db.query(Company).filter(Company.name == company_data.get("name")).first()
|
| 115 |
+
if existing:
|
| 116 |
+
# Update existing
|
| 117 |
+
existing.analysis_result = company_data.get("analysis")
|
| 118 |
+
existing.last_analysis_date = datetime.now()
|
| 119 |
+
existing.description = company_data.get("description", existing.description)
|
| 120 |
+
existing.website = company_data.get("website", existing.website)
|
| 121 |
+
imported.append(existing)
|
| 122 |
+
else:
|
| 123 |
+
# Create new
|
| 124 |
+
new_company = Company(
|
| 125 |
+
name=company_data.get("name"),
|
| 126 |
+
description=company_data.get("description", ""),
|
| 127 |
+
website=company_data.get("website", ""),
|
| 128 |
+
analysis_result=company_data.get("analysis"),
|
| 129 |
+
last_analysis_date=datetime.now()
|
| 130 |
+
)
|
| 131 |
+
db.add(new_company)
|
| 132 |
+
imported.append(new_company)
|
| 133 |
+
|
| 134 |
+
db.commit()
|
| 135 |
+
return {"imported": len(imported), "companies": [c.name for c in imported]}
|
| 136 |
+
|
| 137 |
+
@router.get("/company/{id}")
|
| 138 |
+
def get_company(id: int, db: Session = Depends(get_db)):
|
| 139 |
+
return db.query(Company).filter(Company.id == id).first()
|
| 140 |
+
|
| 141 |
+
@router.delete("/companies/all")
|
| 142 |
+
def delete_all_companies(db: Session = Depends(get_db)):
|
| 143 |
+
"""Delete all companies from the database"""
|
| 144 |
+
count = db.query(Company).delete()
|
| 145 |
+
db.commit()
|
| 146 |
+
return {"message": f"Deleted {count} companies"}
|
| 147 |
+
|
| 148 |
+
@router.delete("/company/{id}")
|
| 149 |
+
def delete_company(id: int, db: Session = Depends(get_db)):
|
| 150 |
+
"""Delete a specific company by ID"""
|
| 151 |
+
company = db.query(Company).filter(Company.id == id).first()
|
| 152 |
+
if not company:
|
| 153 |
+
raise HTTPException(status_code=404, detail="Company not found")
|
| 154 |
+
|
| 155 |
+
db.delete(company)
|
| 156 |
+
db.commit()
|
| 157 |
+
return {"message": f"Deleted company {company.name}"}
|
| 158 |
+
|
| 159 |
+
@router.delete("/requests/cleanup")
|
| 160 |
+
def cleanup_requests(db: Session = Depends(get_db)):
|
| 161 |
+
"""Delete requests that are completed, rejected, or failed"""
|
| 162 |
+
count = db.query(AnalysisRequest).filter(
|
| 163 |
+
AnalysisRequest.status.in_(["completed", "rejected", "failed"])
|
| 164 |
+
).delete(synchronize_session=False)
|
| 165 |
+
db.commit()
|
| 166 |
+
return {"message": f"Cleaned up {count} processed requests"}
|
| 167 |
+
|
| 168 |
+
@router.delete("/request/{id}")
|
| 169 |
+
def delete_request(id: int, db: Session = Depends(get_db)):
|
| 170 |
+
"""Force delete a request"""
|
| 171 |
+
req = db.query(AnalysisRequest).filter(AnalysisRequest.id == id).first()
|
| 172 |
+
if not req:
|
| 173 |
+
raise HTTPException(status_code=404, detail="Request not found")
|
| 174 |
+
db.delete(req)
|
| 175 |
+
db.commit()
|
| 176 |
+
return {"message": "Request deleted"}
|
| 177 |
+
|
| 178 |
+
@router.post("/companies/upload-csv")
|
| 179 |
+
async def upload_companies_csv(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
| 180 |
+
"""
|
| 181 |
+
Upload CSV for live greenwashing analysis with BATCH AI processing.
|
| 182 |
+
"""
|
| 183 |
+
if not file.filename.endswith('.csv'):
|
| 184 |
+
raise HTTPException(status_code=400, detail="Invalid file type. Please upload a CSV.")
|
| 185 |
+
|
| 186 |
+
content = await file.read()
|
| 187 |
+
decoded = content.decode('utf-8-sig')
|
| 188 |
+
csv_reader = csv.DictReader(io.StringIO(decoded))
|
| 189 |
+
|
| 190 |
+
if csv_reader.fieldnames:
|
| 191 |
+
csv_reader.fieldnames = [f.strip().lower() for f in csv_reader.fieldnames]
|
| 192 |
+
|
| 193 |
+
print(f"[DEBUG] CSV Headers found: {csv_reader.fieldnames}")
|
| 194 |
+
|
| 195 |
+
results = []
|
| 196 |
+
gemini_batch = []
|
| 197 |
+
batch_size = 10
|
| 198 |
+
|
| 199 |
+
from app.services.perplexity_client import research_company, PERPLEXITY_API_KEY
|
| 200 |
+
from app.services.llm_generator import generate_batch_insights
|
| 201 |
+
|
| 202 |
+
# Import scoring utilities if not already imported (better to move to top, but here for context)
|
| 203 |
+
from app.services.scoring import analyze_sentiment, calculate_vague_score, calculate_concrete_score
|
| 204 |
+
import re
|
| 205 |
+
|
| 206 |
+
# Helper for counting keywords
|
| 207 |
+
def count_keywords(text: str, keywords: list) -> int:
|
| 208 |
+
count = 0
|
| 209 |
+
text_lower = text.lower()
|
| 210 |
+
for k in keywords:
|
| 211 |
+
count += len(re.findall(r'\b' + re.escape(k) + r'\b', text_lower))
|
| 212 |
+
return count
|
| 213 |
+
|
| 214 |
+
# Keyword lists (reused from analysis_engine concept)
|
| 215 |
+
GREEN_KEYWORDS = ['sustainable', 'eco-friendly', 'green', 'carbon neutral', 'net zero', 'renewable', 'biodegradable']
|
| 216 |
+
EMISSION_KEYWORDS = ['emission', 'co2', 'carbon']
|
| 217 |
+
ENERGY_KEYWORDS = ['energy', 'solar', 'wind', 'power']
|
| 218 |
+
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic']
|
| 219 |
+
|
| 220 |
+
gemini_batch = []
|
| 221 |
+
batch_size = 10
|
| 222 |
+
|
| 223 |
+
def process_batch_and_save(batch_items):
|
| 224 |
+
if not batch_items: return
|
| 225 |
+
|
| 226 |
+
# Split batch into AI-needed and Fast-Path
|
| 227 |
+
ai_needed_items = [item for item in batch_items if not item.get('skip_ai')]
|
| 228 |
+
fast_path_items = [item for item in batch_items if item.get('skip_ai')]
|
| 229 |
+
|
| 230 |
+
batch_insights = {}
|
| 231 |
+
|
| 232 |
+
# 1. Generate AI Insights ONLY for needed items
|
| 233 |
+
if ai_needed_items:
|
| 234 |
+
ai_inputs = [{"name": item['name'], "context": item['context']} for item in ai_needed_items]
|
| 235 |
+
print(f"Processing batch of {len(ai_inputs)} companies via AI Service...")
|
| 236 |
+
|
| 237 |
+
# Add small delay only if calling AI
|
| 238 |
+
if len(ai_inputs) > 0:
|
| 239 |
+
time.sleep(2)
|
| 240 |
+
|
| 241 |
+
batch_insights = generate_batch_insights(ai_inputs)
|
| 242 |
+
|
| 243 |
+
# 2. Merge and Save (Process both lists)
|
| 244 |
+
for item in batch_items:
|
| 245 |
+
name = item['name']
|
| 246 |
+
|
| 247 |
+
if item.get('skip_ai'):
|
| 248 |
+
# Fast Path Defaults
|
| 249 |
+
desc = item.get('text')[:500] if item.get('text') else "Imported via CSV (Manual Assessment)"
|
| 250 |
+
recs = ["Maintain current transparency"] if item['gw_label'] == 0 else ["Improve data disclosure"]
|
| 251 |
+
else:
|
| 252 |
+
# AI Results
|
| 253 |
+
insights = batch_insights.get(name, {})
|
| 254 |
+
desc = insights.get("description", "AI description pending or unavailable.")
|
| 255 |
+
recs = insights.get("recommendations", {})
|
| 256 |
+
|
| 257 |
+
# Construct Final Result
|
| 258 |
+
analysis_result = {
|
| 259 |
+
"company_name": name,
|
| 260 |
+
"company_description": desc,
|
| 261 |
+
"last_updated": datetime.now().isoformat(),
|
| 262 |
+
"confidence_score": f"{item['prediction']['details'].get('confidence', 'N/A')}% (AI)" if not item.get('skip_ai') else "100% (Manual)",
|
| 263 |
+
"greenwashingLabel": item['gw_label'],
|
| 264 |
+
"internal_documents_analysis": {
|
| 265 |
+
"major_findings": [
|
| 266 |
+
f"Risk Level: {item['final_label_str']}",
|
| 267 |
+
f"Reason: {item['reasoning_text']}"
|
| 268 |
+
],
|
| 269 |
+
"compliance_risks": [item['reasoning_text']] if item['gw_label'] == 1 else []
|
| 270 |
+
},
|
| 271 |
+
"reviews_analysis": {
|
| 272 |
+
"employee_tone": "N/A",
|
| 273 |
+
"customer_tone": "N/A",
|
| 274 |
+
"common_issues": [],
|
| 275 |
+
"overall_sentiment_score": f"{int(item['features_dict']['overall_sentiment_score'] * 100)}/100"
|
| 276 |
+
},
|
| 277 |
+
"recommended_actions": recs,
|
| 278 |
+
"external_summary": {
|
| 279 |
+
"key_highlights": [f"External Sentiment Gap: {item['features_dict']['external_sentiment_gap']}"],
|
| 280 |
+
"public_sentiment": "Mixed" if item['features_dict']['external_sentiment_gap'] > 0.1 else "Positive",
|
| 281 |
+
"recent_news_summary": item['reasoning_text'],
|
| 282 |
+
"possible_bias": "None",
|
| 283 |
+
},
|
| 284 |
+
"risk_assessment": {
|
| 285 |
+
"financial_risk": "High" if item['final_label_str'] == "Greenwashing" else "Low",
|
| 286 |
+
"reputation_risk": "Critical" if item['final_label_str'] == "Greenwashing" else ("Medium" if item['final_label_str'] == "At Risk" else "Low"),
|
| 287 |
+
"compliance_risk": "Medium",
|
| 288 |
+
"market_risk": "Low",
|
| 289 |
+
"overall_risk_level": item['final_label_str']
|
| 290 |
+
},
|
| 291 |
+
"final_company_score": {
|
| 292 |
+
"rating_out_of_100": int(item['features_dict']['overall_sentiment_score'] * 100) if item['features_dict']['overall_sentiment_score'] <= 1 else int(item['features_dict']['overall_sentiment_score']),
|
| 293 |
+
"label": item['prediction']['model_label']
|
| 294 |
+
},
|
| 295 |
+
"detailed_scores": item['features_dict'],
|
| 296 |
+
"generated_summary": f"Classified as {item['prediction']['model_label']}"
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
results.append({"name": name, "label": item['gw_label'], "status": f"Processed ({item['final_label_str']})"})
|
| 300 |
+
|
| 301 |
+
# DB Save
|
| 302 |
+
existing = db.query(Company).filter(Company.name == name).first()
|
| 303 |
+
if existing:
|
| 304 |
+
existing.analysis_result = analysis_result
|
| 305 |
+
existing.last_analysis_date = datetime.now()
|
| 306 |
+
else:
|
| 307 |
+
new_company = Company(
|
| 308 |
+
name=name,
|
| 309 |
+
description=desc,
|
| 310 |
+
analysis_result=analysis_result,
|
| 311 |
+
last_analysis_date=datetime.now()
|
| 312 |
+
)
|
| 313 |
+
db.add(new_company)
|
| 314 |
+
db.commit()
|
| 315 |
+
|
| 316 |
+
for row in csv_reader:
|
| 317 |
+
# Flexible column names (normalized)
|
| 318 |
+
name = row.get('company_name') or row.get('company') or row.get('name')
|
| 319 |
+
text = row.get('description') or row.get('text') or row.get('claims') or ""
|
| 320 |
+
|
| 321 |
+
if not name:
|
| 322 |
+
continue
|
| 323 |
+
|
| 324 |
+
# --- FEATURE CALCULATION (If columns missing) ---
|
| 325 |
+
# 1. Base Sentiment
|
| 326 |
+
sentiment_res = analyze_sentiment([text] if text else [])
|
| 327 |
+
overall_sentiment = sentiment_res['score']
|
| 328 |
+
|
| 329 |
+
# 2. Keyword Stats
|
| 330 |
+
green_freq = float(row.get('green keyword frequecy') or row.get('green keyword frequency') or count_keywords(text, GREEN_KEYWORDS))
|
| 331 |
+
|
| 332 |
+
# 3. Vague/Concrete Scores (Using simple heuristic or scoring func)
|
| 333 |
+
# Assuming scoring.py has these, if not, fallback to simple version:
|
| 334 |
+
try:
|
| 335 |
+
# Basic sentence splitting
|
| 336 |
+
sentences = [s.strip() for s in text.split('.') if s.strip()]
|
| 337 |
+
vague_ratio = float(row.get('vague keyword ratio') or calculate_vague_score(sentences))
|
| 338 |
+
concrete_ratio = float(row.get('concrete cailm ratio') or row.get('concrete claim ratio') or calculate_concrete_score(sentences))
|
| 339 |
+
except:
|
| 340 |
+
vague_ratio = 0.2
|
| 341 |
+
concrete_ratio = 0.3
|
| 342 |
+
|
| 343 |
+
# 4. Aspect Sentiments (Fallback to overall if specific not found)
|
| 344 |
+
emission_sent = float(row.get('emission sentiment ') or row.get('emission sentiment') or overall_sentiment)
|
| 345 |
+
energy_sent = float(row.get('energy sentiment') or overall_sentiment)
|
| 346 |
+
waste_sent = float(row.get('waste sentiment') or overall_sentiment)
|
| 347 |
+
|
| 348 |
+
# EXTRACT FEATURES FOR MODEL (AND FRONTEND DISPLAY)
|
| 349 |
+
# Naming Verification:
|
| 350 |
+
# Frontend (Analytics.tsx) expects:
|
| 351 |
+
# - green_keyword_frequency
|
| 352 |
+
# - vague_keyword_ratio
|
| 353 |
+
# - concrete_claim_ratio
|
| 354 |
+
# - external_sentiment_gap
|
| 355 |
+
# - emission_sentiment
|
| 356 |
+
# - energy_sentiment
|
| 357 |
+
# - waste_sentiment
|
| 358 |
+
# - relative_focus_score
|
| 359 |
+
|
| 360 |
+
features_dict = {
|
| 361 |
+
'green_keyword_frequency': green_freq,
|
| 362 |
+
'vague_keyword_ratio': vague_ratio,
|
| 363 |
+
'concrete_claim_ratio': concrete_ratio,
|
| 364 |
+
'overall_sentiment_score': overall_sentiment,
|
| 365 |
+
'external_sentiment_gap': float(row.get('external_sentiment_gap') or 0.4),
|
| 366 |
+
'emission_sentiment': emission_sent,
|
| 367 |
+
'energy_sentiment': energy_sent,
|
| 368 |
+
'waste_sentiment': waste_sent,
|
| 369 |
+
'relative_focus_score': float(row.get('relative focus score') or 0.5)
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
gw_label_raw = row.get('greenwashing_label') or row.get('greenwashing label') or row.get('category')
|
| 373 |
+
skip_ai = False
|
| 374 |
+
|
| 375 |
+
if gw_label_raw:
|
| 376 |
+
# Manual label from CSV - TRUST IT (No AI)
|
| 377 |
+
skip_ai = True
|
| 378 |
+
final_label_str = str(gw_label_raw).strip()
|
| 379 |
+
if final_label_str.lower() in ['greenwashing', 'high', 'critical', '1']:
|
| 380 |
+
final_label_str = "Greenwashing"; gw_label = 1
|
| 381 |
+
elif final_label_str.lower() in ['medium', 'at risk']:
|
| 382 |
+
final_label_str = "At Risk"; gw_label = 1
|
| 383 |
+
else:
|
| 384 |
+
final_label_str = "No Risk"; gw_label = 0
|
| 385 |
+
|
| 386 |
+
reasoning_text = f"Classified as {final_label_str} based on historical CSV data."
|
| 387 |
+
|
| 388 |
+
# Initialize dummy prediction for compatibility
|
| 389 |
+
prediction = {
|
| 390 |
+
'risk_label': final_label_str,
|
| 391 |
+
'greenwashing_risk': gw_label,
|
| 392 |
+
'details': {'confidence': 100},
|
| 393 |
+
'model_label': final_label_str
|
| 394 |
+
}
|
| 395 |
+
else:
|
| 396 |
+
# AI/Model Prediction (Fallback only if no label)
|
| 397 |
+
prediction = predict_greenwashing_risk(text, company_name=name, features_dict=features_dict)
|
| 398 |
+
|
| 399 |
+
final_label_str = prediction['risk_label']
|
| 400 |
+
# Map old AI outputs to new strings just in case
|
| 401 |
+
if final_label_str == "High" or final_label_str == "Critical": final_label_str = "Greenwashing"
|
| 402 |
+
elif final_label_str == "Medium": final_label_str = "At Risk"
|
| 403 |
+
elif final_label_str == "Low": final_label_str = "No Risk"
|
| 404 |
+
|
| 405 |
+
gw_label = 1 if final_label_str in ["Greenwashing", "At Risk"] else 0
|
| 406 |
+
reasoning_text = f"AI Analysis: Classified as {final_label_str} based on pattern matching."
|
| 407 |
+
|
| 408 |
+
# --- HEURISTIC OVERRIDE (Forcing Sensitivity) ---
|
| 409 |
+
# If Vague > 0.50 AND not enough concrete data to justify it (>10%)
|
| 410 |
+
if vague_ratio > 0.50 and concrete_ratio < 0.10:
|
| 411 |
+
final_label_str = "Greenwashing"
|
| 412 |
+
gw_label = 1
|
| 413 |
+
reasoning_text = "Risk High: Excessive vague language without supporting concrete data."
|
| 414 |
+
elif concrete_ratio < 0.01 and overall_sentiment > 0.6:
|
| 415 |
+
final_label_str = "Greenwashing"
|
| 416 |
+
gw_label = 1
|
| 417 |
+
reasoning_text = "Greenwashing Alert: Positive claims lack concrete evidence."
|
| 418 |
+
|
| 419 |
+
# PERPLEXITY CHECK (Instant Processing for Paid API)
|
| 420 |
+
pplx_success = False
|
| 421 |
+
if PERPLEXITY_API_KEY and not skip_ai:
|
| 422 |
+
pplx_data = research_company(name)
|
| 423 |
+
if pplx_data:
|
| 424 |
+
pplx_success = True
|
| 425 |
+
# If Perplexity worked, save immediately and skip batch
|
| 426 |
+
# Construct partial item to reuse logic or save directly?
|
| 427 |
+
# Saving directly is safer to avoid mixups.
|
| 428 |
+
desc = pplx_data.get("description", "AI unavailable")
|
| 429 |
+
recs = pplx_data.get("recommendations", {})
|
| 430 |
+
if "Controversy" in str(pplx_data.get("findings")): gw_label = 1 # Update risk
|
| 431 |
+
|
| 432 |
+
# ... (Reuse Construction Logic?) ...
|
| 433 |
+
# For brevity, I will just add it to a "processed_item" and call save single?
|
| 434 |
+
# Actually, let's just make a fake batch of 1 and reuse the save logic but pass pre-filled data?
|
| 435 |
+
# Complexity: High.
|
| 436 |
+
|
| 437 |
+
# Simplification: Treat Perplexity result as "batch insights" result for a batch of 1.
|
| 438 |
+
# Mock batch_insights structure
|
| 439 |
+
# Call save logic manually or refactor `process_batch_and_save` to accept external insights?
|
| 440 |
+
|
| 441 |
+
# Plan: Construct `item` manually, adding 'pplx_insights' key. Update `process_batch` to check for it.
|
| 442 |
+
pass
|
| 443 |
+
|
| 444 |
+
# Prepare Context
|
| 445 |
+
context = f"""
|
| 446 |
+
Greenwashing Risk: {final_label_str}
|
| 447 |
+
Reason: {reasoning_text}
|
| 448 |
+
Sentiment: {features_dict['overall_sentiment_score']:.2f}
|
| 449 |
+
"""
|
| 450 |
+
|
| 451 |
+
item_data = {
|
| 452 |
+
"name": name,
|
| 453 |
+
"text": text,
|
| 454 |
+
"context": context,
|
| 455 |
+
"prediction": prediction,
|
| 456 |
+
"features_dict": features_dict,
|
| 457 |
+
"gw_label": gw_label,
|
| 458 |
+
"final_label_str": final_label_str,
|
| 459 |
+
"reasoning_text": reasoning_text,
|
| 460 |
+
"skip_ai": skip_ai
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
# Queue for Batch
|
| 464 |
+
gemini_batch.append(item_data)
|
| 465 |
+
|
| 466 |
+
if len(gemini_batch) >= batch_size:
|
| 467 |
+
process_batch_and_save(gemini_batch)
|
| 468 |
+
gemini_batch = []
|
| 469 |
+
|
| 470 |
+
# Final batch
|
| 471 |
+
if gemini_batch:
|
| 472 |
+
process_batch_and_save(gemini_batch)
|
| 473 |
+
|
| 474 |
+
return {
|
| 475 |
+
"message": f"Processed {len(results)} companies using Batch AI Analysis.",
|
| 476 |
+
"predictions": results
|
| 477 |
+
}
|
app/db/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
app/db/__pycache__/models.cpython-311.pyc
ADDED
|
Binary file (2.47 kB). View file
|
|
|
app/db/__pycache__/session.cpython-310.pyc
ADDED
|
Binary file (639 Bytes). View file
|
|
|
app/db/__pycache__/session.cpython-311.pyc
ADDED
|
Binary file (1.03 kB). View file
|
|
|
app/db/models.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, Text, JSON, DateTime, ForeignKey, Float
|
| 2 |
+
from sqlalchemy.orm import relationship
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from .session import Base
|
| 5 |
+
|
| 6 |
+
class Company(Base):
|
| 7 |
+
__tablename__ = "companies"
|
| 8 |
+
|
| 9 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 10 |
+
name = Column(String, unique=True, index=True)
|
| 11 |
+
description = Column(Text, nullable=True)
|
| 12 |
+
website = Column(String, nullable=True)
|
| 13 |
+
last_analysis_date = Column(DateTime, default=datetime.utcnow)
|
| 14 |
+
|
| 15 |
+
# JSON blobs for structured analysis data
|
| 16 |
+
analysis_result = Column(JSON, nullable=True)
|
| 17 |
+
|
| 18 |
+
requests = relationship("AnalysisRequest", back_populates="company")
|
| 19 |
+
|
| 20 |
+
class AnalysisRequest(Base):
|
| 21 |
+
__tablename__ = "requests"
|
| 22 |
+
|
| 23 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 24 |
+
user_id = Column(String, index=True) # Linking to frontend user ID
|
| 25 |
+
company_name = Column(String)
|
| 26 |
+
website = Column(String, nullable=True)
|
| 27 |
+
document_name = Column(String, nullable=True)
|
| 28 |
+
document_content = Column(Text, nullable=True) # Extracted text from PDF
|
| 29 |
+
|
| 30 |
+
status = Column(String, default="pending") # pending, processing, completed, failed
|
| 31 |
+
submission_date = Column(DateTime, default=datetime.utcnow)
|
| 32 |
+
|
| 33 |
+
analysis_result = Column(JSON, nullable=True)
|
| 34 |
+
rejection_reason = Column(String, nullable=True)
|
| 35 |
+
|
| 36 |
+
company_id = Column(Integer, ForeignKey("companies.id"), nullable=True)
|
| 37 |
+
company = relationship("Company", back_populates="requests")
|
app/db/session.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import create_engine
|
| 2 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 3 |
+
from sqlalchemy.orm import sessionmaker
|
| 4 |
+
|
| 5 |
+
SQLALCHEMY_DATABASE_URL = "sqlite:///./greenintellect.db"
|
| 6 |
+
# SQLALCHEMY_DATABASE_URL = "postgresql://user:password@postgresserver/db"
|
| 7 |
+
|
| 8 |
+
engine = create_engine(
|
| 9 |
+
SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
|
| 10 |
+
)
|
| 11 |
+
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 12 |
+
|
| 13 |
+
Base = declarative_base()
|
| 14 |
+
|
| 15 |
+
def get_db():
|
| 16 |
+
db = SessionLocal()
|
| 17 |
+
try:
|
| 18 |
+
yield db
|
| 19 |
+
finally:
|
| 20 |
+
db.close()
|
app/main.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from .api import endpoints
|
| 5 |
+
from .db.session import engine, Base
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
# Create Tables
|
| 10 |
+
|
| 11 |
+
# Create Tables
|
| 12 |
+
Base.metadata.create_all(bind=engine)
|
| 13 |
+
|
| 14 |
+
app = FastAPI(title="Green Intellect API", version="1.0.0")
|
| 15 |
+
|
| 16 |
+
# CORS
|
| 17 |
+
app.add_middleware(
|
| 18 |
+
CORSMiddleware,
|
| 19 |
+
allow_origins=["*"],
|
| 20 |
+
allow_credentials=True,
|
| 21 |
+
allow_methods=["*"],
|
| 22 |
+
allow_headers=["*"],
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
app.include_router(endpoints.router, prefix="/api")
|
| 26 |
+
|
| 27 |
+
@app.get("/")
|
| 28 |
+
def read_root():
|
| 29 |
+
return {"message": "Welcome to Green Intellect API"}
|
| 30 |
+
|
| 31 |
+
if __name__ == "__main__":
|
| 32 |
+
import uvicorn
|
| 33 |
+
uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
|
app/services/__pycache__/analysis_engine.cpython-310.pyc
ADDED
|
Binary file (7.91 kB). View file
|
|
|
app/services/__pycache__/analysis_engine.cpython-311.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
app/services/__pycache__/hugchat_client.cpython-311.pyc
ADDED
|
Binary file (2.36 kB). View file
|
|
|
app/services/__pycache__/llm_generator.cpython-311.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
app/services/__pycache__/ml_logic.cpython-311.pyc
ADDED
|
Binary file (6.12 kB). View file
|
|
|
app/services/__pycache__/ml_models.cpython-310.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
app/services/__pycache__/ml_models.cpython-311.pyc
ADDED
|
Binary file (1.8 kB). View file
|
|
|
app/services/__pycache__/pdf_processor.cpython-310.pyc
ADDED
|
Binary file (887 Bytes). View file
|
|
|
app/services/__pycache__/pdf_processor.cpython-311.pyc
ADDED
|
Binary file (1.62 kB). View file
|
|
|
app/services/__pycache__/perplexity_client.cpython-311.pyc
ADDED
|
Binary file (2.87 kB). View file
|
|
|
app/services/__pycache__/scoring.cpython-310.pyc
ADDED
|
Binary file (3.67 kB). View file
|
|
|
app/services/__pycache__/scoring.cpython-311.pyc
ADDED
|
Binary file (7.65 kB). View file
|
|
|
app/services/__pycache__/scraper.cpython-310.pyc
ADDED
|
Binary file (4.39 kB). View file
|
|
|
app/services/__pycache__/scraper.cpython-311.pyc
ADDED
|
Binary file (18.4 kB). View file
|
|
|
app/services/analysis_engine.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
from .pdf_processor import extract_text_from_pdf, split_sentences, clean_text
|
| 3 |
+
from .scraper import get_company_news, get_company_reviews, report_progress
|
| 4 |
+
from .scoring import calculate_scores, analyze_sentiment, analyze_aspect_sentiment, calculate_vague_score, calculate_concrete_score
|
| 5 |
+
from .llm_generator import generate_company_description, generate_ai_recommendations
|
| 6 |
+
|
| 7 |
+
# Aspect Keywords
|
| 8 |
+
EMISSION_KEYWORDS = ['emission', 'carbon', 'co2', 'greenhouse', 'pollution', 'net zero', 'carbon neutral']
|
| 9 |
+
ENERGY_KEYWORDS = ['energy', 'renewable', 'solar', 'wind', 'electricity', 'fuel', 'power']
|
| 10 |
+
WASTE_KEYWORDS = ['waste', 'recycling', 'plastic', 'circular economy', 'disposal', 'landfill']
|
| 11 |
+
|
| 12 |
+
def detect_contradictions(pdf_text, news_articles):
|
| 13 |
+
"""
|
| 14 |
+
Detect contradictions between company claims (PDF) and external reports (news)
|
| 15 |
+
Returns list of contradictions with evidence
|
| 16 |
+
"""
|
| 17 |
+
contradictions = []
|
| 18 |
+
|
| 19 |
+
# Keywords that indicate strong claims
|
| 20 |
+
claim_keywords = ['committed', 'achieved', 'reduced', 'eliminated', 'carbon neutral', 'net zero', 'sustainable']
|
| 21 |
+
|
| 22 |
+
# Keywords that indicate environmental context (Strict Physical Terms only)
|
| 23 |
+
# Removed generic words like 'green', 'sustainability', 'environmental' which appear in financial contexts
|
| 24 |
+
env_context = ['climate', 'carbon', 'emission', 'pollution', 'waste', 'biodiversity', 'fossil fuel', 'deforestation', 'ecological']
|
| 25 |
+
|
| 26 |
+
# Exclude regulators to avoid flagging financial fines as greenwashing
|
| 27 |
+
# (RBI, SEBI, SEC, etc.)
|
| 28 |
+
financial_exclusions = ['rbi', 'sebi', 'sec', 'money laundering', 'insider trading', 'stock market', 'shares', 'quarterly result']
|
| 29 |
+
|
| 30 |
+
for article in news_articles:
|
| 31 |
+
# Check if article is relevant to environment before counting it as a contradiction
|
| 32 |
+
text = (article['title'] + " " + article['content']).lower()
|
| 33 |
+
|
| 34 |
+
# Safety Check: If it mentions financial regulators/crimes, IGNORE even if it says "Green"
|
| 35 |
+
if any(ex in text for ex in financial_exclusions):
|
| 36 |
+
continue
|
| 37 |
+
|
| 38 |
+
if not any(k in text for k in env_context):
|
| 39 |
+
continue
|
| 40 |
+
|
| 41 |
+
for key in claim_keywords:
|
| 42 |
+
if key in text and any(neg in text for neg in ['false', 'misleading', 'investigation', 'lawsuit', 'fine', 'violation']):
|
| 43 |
+
contradictions.append({
|
| 44 |
+
"claim_type": "Environmental claim questioned",
|
| 45 |
+
"evidence": article['title'],
|
| 46 |
+
"source": article['url'],
|
| 47 |
+
"risk_level": "High"
|
| 48 |
+
})
|
| 49 |
+
break
|
| 50 |
+
|
| 51 |
+
# Keywords that indicate skepticism or allegations
|
| 52 |
+
skeptic_keywords = ['greenwashing', 'false claims', 'misleading', 'controversy', 'lawsuit', 'allegations']
|
| 53 |
+
|
| 54 |
+
pdf_lower = pdf_text.lower()
|
| 55 |
+
has_strong_claims = any(keyword in pdf_lower for keyword in claim_keywords)
|
| 56 |
+
|
| 57 |
+
if has_strong_claims:
|
| 58 |
+
for article in news_articles:
|
| 59 |
+
content_lower = article['content'].lower()
|
| 60 |
+
if any(keyword in content_lower for keyword in skeptic_keywords):
|
| 61 |
+
contradictions.append({
|
| 62 |
+
"claim_type": "Environmental commitment",
|
| 63 |
+
"evidence_url": article['url'],
|
| 64 |
+
"evidence_title": article['title'],
|
| 65 |
+
"severity": "High"
|
| 66 |
+
})
|
| 67 |
+
|
| 68 |
+
# New: General Compliance Risk Detection (Not just contradictions)
|
| 69 |
+
# Search for specific legal/compliance keywords in all articles
|
| 70 |
+
compliance_keywords = ['lawsuit', 'fine', 'penalty', 'violation', 'non-compliance', 'EPA', 'investigation', 'fraud', 'illegal']
|
| 71 |
+
for article in news_articles:
|
| 72 |
+
content_lower = article['content'].lower()
|
| 73 |
+
if any(keyword in content_lower for keyword in compliance_keywords):
|
| 74 |
+
contradictions.append({ # Leveraging the same list for now, or could create a separate list
|
| 75 |
+
"claim_type": "Regulatory Compliance Issue",
|
| 76 |
+
"evidence_url": article['url'],
|
| 77 |
+
"evidence_title": article['title'],
|
| 78 |
+
"severity": "Critical"
|
| 79 |
+
})
|
| 80 |
+
|
| 81 |
+
return contradictions
|
| 82 |
+
|
| 83 |
+
def detect_hidden_patterns(all_reviews):
|
| 84 |
+
"""
|
| 85 |
+
Analyze reviews to find hidden patterns:
|
| 86 |
+
- Sudden changes in sentiment
|
| 87 |
+
- Repeated phrases (astroturfing)
|
| 88 |
+
- Discrepancies between employee and customer reviews
|
| 89 |
+
"""
|
| 90 |
+
patterns = []
|
| 91 |
+
|
| 92 |
+
if len(all_reviews) > 10:
|
| 93 |
+
# Check for repeated phrases (potential fake reviews)
|
| 94 |
+
content_texts = [r['content'][:500] for r in all_reviews]
|
| 95 |
+
unique_ratio = len(set(content_texts)) / len(content_texts)
|
| 96 |
+
|
| 97 |
+
if unique_ratio < 0.7:
|
| 98 |
+
patterns.append({
|
| 99 |
+
"pattern": "Potential astroturfing detected",
|
| 100 |
+
"description": f"Only {int(unique_ratio*100)}% unique review content - may indicate coordinated posting",
|
| 101 |
+
"severity": "Medium"
|
| 102 |
+
})
|
| 103 |
+
|
| 104 |
+
# Check for platform discrepancies
|
| 105 |
+
glassdoor_reviews = [r for r in all_reviews if 'glassdoor' in r['url'].lower()]
|
| 106 |
+
reddit_reviews = [r for r in all_reviews if 'reddit' in r['url'].lower()]
|
| 107 |
+
|
| 108 |
+
if glassdoor_reviews and reddit_reviews:
|
| 109 |
+
patterns.append({
|
| 110 |
+
"pattern": "Multi-platform analysis available",
|
| 111 |
+
"description": f"Found {len(glassdoor_reviews)} Glassdoor and {len(reddit_reviews)} Reddit discussions for cross-validation",
|
| 112 |
+
"severity": "Info"
|
| 113 |
+
})
|
| 114 |
+
|
| 115 |
+
return patterns
|
| 116 |
+
|
| 117 |
+
async def analyze_company(company_name: str, pdf_path: str):
|
| 118 |
+
report_progress(f"Starting comprehensive analysis for {company_name}", 5)
|
| 119 |
+
|
| 120 |
+
# 1. Process PDF
|
| 121 |
+
report_progress("Processing PDF document...", 8)
|
| 122 |
+
pdf_text = extract_text_from_pdf(pdf_path)
|
| 123 |
+
pdf_sentences = split_sentences(pdf_text)
|
| 124 |
+
|
| 125 |
+
# --- PERPLEXITY AI INTEGRATION ---
|
| 126 |
+
from .perplexity_client import research_company, PERPLEXITY_API_KEY
|
| 127 |
+
pplx_data = None
|
| 128 |
+
|
| 129 |
+
if PERPLEXITY_API_KEY:
|
| 130 |
+
report_progress("Conducting deep research...", 15)
|
| 131 |
+
pplx_data = research_company(company_name)
|
| 132 |
+
|
| 133 |
+
# 2. Comprehensive Scraping (ALL available sources)
|
| 134 |
+
# Always run scraping to get real news, even if Perplexity is active
|
| 135 |
+
news_articles = await get_company_news(company_name)
|
| 136 |
+
|
| 137 |
+
# Optional: We can still use Perplexity findings for internal scoring without displaying them as 'news'
|
| 138 |
+
if pplx_data:
|
| 139 |
+
pass # Findings already in pplx_data for later use
|
| 140 |
+
|
| 141 |
+
# Progress 50-80% handled by get_company_reviews
|
| 142 |
+
reviews = await get_company_reviews(company_name)
|
| 143 |
+
|
| 144 |
+
# Progress 50-80% handled by get_company_reviews
|
| 145 |
+
reviews = await get_company_reviews(company_name)
|
| 146 |
+
|
| 147 |
+
# 3. Analyze PDF Content
|
| 148 |
+
report_progress("Analyzing PDF content...", 82)
|
| 149 |
+
pdf_scores = calculate_scores(pdf_sentences)
|
| 150 |
+
|
| 151 |
+
# 4. Detect Contradictions and Hidden Patterns
|
| 152 |
+
report_progress("Detecting contradictions and patterns...", 85)
|
| 153 |
+
contradictions = detect_contradictions(pdf_text, news_articles)
|
| 154 |
+
hidden_patterns = detect_hidden_patterns(reviews)
|
| 155 |
+
|
| 156 |
+
# 5. Analyze External Sentiment with ALL data
|
| 157 |
+
report_progress("Analyzing sentiment...", 90)
|
| 158 |
+
news_text = [a['content'] for a in news_articles]
|
| 159 |
+
reviews_text = [r['content'] for r in reviews]
|
| 160 |
+
all_external_text = news_text + reviews_text
|
| 161 |
+
|
| 162 |
+
news_sentiment = analyze_sentiment(news_text) if news_text else {'label': 'Neutral', 'score': 0.5}
|
| 163 |
+
reviews_sentiment = analyze_sentiment(reviews_text) if reviews_text else {'label': 'Neutral', 'score': 0.5}
|
| 164 |
+
|
| 165 |
+
# Aspect-based sentiment (REAL SCORES)
|
| 166 |
+
emission_sentiment = analyze_aspect_sentiment(all_external_text, EMISSION_KEYWORDS)
|
| 167 |
+
energy_sentiment = analyze_aspect_sentiment(all_external_text, ENERGY_KEYWORDS)
|
| 168 |
+
waste_sentiment = analyze_aspect_sentiment(all_external_text, WASTE_KEYWORDS)
|
| 169 |
+
|
| 170 |
+
# 6. Calculate Evidence-Based Score with detailed metrics
|
| 171 |
+
report_progress("Calculating final scores...", 95)
|
| 172 |
+
|
| 173 |
+
# Calculate detailed scores (REAL METRICS)
|
| 174 |
+
green_keyword_freq = pdf_scores['env_count'] / max(len(pdf_sentences), 1)
|
| 175 |
+
vague_ratio = calculate_vague_score(pdf_sentences)
|
| 176 |
+
concrete_ratio = calculate_concrete_score(pdf_sentences)
|
| 177 |
+
|
| 178 |
+
# --- IMPROVED SCORING FORMULA ---
|
| 179 |
+
# We now calculate the composite sentiment FIRST and let it drive the external portion of the score.
|
| 180 |
+
# See lines 340+ for where we normally calculated it. We'll do it here to affect the score.
|
| 181 |
+
|
| 182 |
+
# 1. Internal Sentiment
|
| 183 |
+
internal_sentiment_data = analyze_sentiment(pdf_scores['env_sentences'])
|
| 184 |
+
|
| 185 |
+
def get_linear_score_local(s_dict):
|
| 186 |
+
# Convert label+confidence to 0-100 scale
|
| 187 |
+
if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50) # 50-100
|
| 188 |
+
if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50) # 0-50
|
| 189 |
+
return 50 # Neutral
|
| 190 |
+
|
| 191 |
+
s_int = get_linear_score_local(internal_sentiment_data)
|
| 192 |
+
s_ext = get_linear_score_local(news_sentiment)
|
| 193 |
+
s_rev = get_linear_score_local(reviews_sentiment)
|
| 194 |
+
|
| 195 |
+
# 2. Composite Sentiment Score (0-100)
|
| 196 |
+
# 35% Internal (What they say) + 45% External (News) + 20% Reviews (Employee/Public)
|
| 197 |
+
composite_score_val = (s_int * 0.35) + (s_ext * 0.45) + (s_rev * 0.20)
|
| 198 |
+
|
| 199 |
+
# 3. Base Score Calculation
|
| 200 |
+
# We blend the Composite Sentiment (Qualitative) with Concrete Data (Quantitative)
|
| 201 |
+
|
| 202 |
+
# Start with the Sentiment Score (0-100)
|
| 203 |
+
final_score = composite_score_val
|
| 204 |
+
|
| 205 |
+
# Adjust based on Concrete Data (The "Proof")
|
| 206 |
+
# If they have high concrete data, boost the score.
|
| 207 |
+
# If they have high vague language, penalize the score.
|
| 208 |
+
|
| 209 |
+
score_modifier = 0
|
| 210 |
+
score_modifier += min(concrete_ratio * 100, 25) # Up to +25 points for concrete data
|
| 211 |
+
score_modifier -= min(vague_ratio * 50, 20) # Up to -20 points for vague language
|
| 212 |
+
|
| 213 |
+
# Apply modifier
|
| 214 |
+
final_score += score_modifier
|
| 215 |
+
|
| 216 |
+
# Contradiction Penalty (Facts Check)
|
| 217 |
+
if contradictions:
|
| 218 |
+
# Heavily penalize for contradictions
|
| 219 |
+
final_score -= (len(contradictions) * 15)
|
| 220 |
+
|
| 221 |
+
# Cap at 0-100
|
| 222 |
+
final_score = max(0, min(100, final_score))
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
# Calculate external sentiment gap
|
| 226 |
+
ext_gap = abs(news_sentiment['score'] - reviews_sentiment['score'])
|
| 227 |
+
# Determine label
|
| 228 |
+
if final_score >= 80: label = "Excellent"
|
| 229 |
+
elif final_score >= 60: label = "Good"
|
| 230 |
+
elif final_score >= 40: label = "Average"
|
| 231 |
+
elif final_score >= 20: label = "At Risk"
|
| 232 |
+
else: label = "Greenwashing"
|
| 233 |
+
|
| 234 |
+
# Determine risk level (3-State System)
|
| 235 |
+
# 2 = Greenwashing (High/Critical)
|
| 236 |
+
# 1 = At Risk (Medium)
|
| 237 |
+
# 0 = No Risk (Low)
|
| 238 |
+
risk_level_code = 0
|
| 239 |
+
risk_reasons = []
|
| 240 |
+
|
| 241 |
+
# 1. Contradictions (Immediate Greenwashing)
|
| 242 |
+
if contradictions:
|
| 243 |
+
risk_level_code = 2
|
| 244 |
+
risk_reasons.append("External contradictions found")
|
| 245 |
+
|
| 246 |
+
# 2. Score Thresholds
|
| 247 |
+
if final_score < 40:
|
| 248 |
+
risk_level_code = max(risk_level_code, 2)
|
| 249 |
+
risk_reasons.append(f"Critical Sustainability Score ({int(final_score)}/100)")
|
| 250 |
+
elif final_score < 60:
|
| 251 |
+
risk_level_code = max(risk_level_code, 1) # At Risk
|
| 252 |
+
|
| 253 |
+
# 3. Vague Language
|
| 254 |
+
if vague_ratio > 0.50 and concrete_ratio < 0.10:
|
| 255 |
+
risk_level_code = 2
|
| 256 |
+
risk_reasons.append("Excessive vague language")
|
| 257 |
+
elif vague_ratio > 0.40 and concrete_ratio < 0.20:
|
| 258 |
+
risk_level_code = max(risk_level_code, 1) # At Risk
|
| 259 |
+
|
| 260 |
+
# 4. Empty Claims
|
| 261 |
+
if news_sentiment['label'] == 'Positive' and concrete_ratio < 0.01:
|
| 262 |
+
risk_level_code = 2
|
| 263 |
+
risk_reasons.append("Positive press without concrete data")
|
| 264 |
+
|
| 265 |
+
# --- SAFE HARBOR OVERRIDE ---
|
| 266 |
+
high_risk_industries = ['coal', 'oil', 'petroleum', 'mining', 'gas', 'cement', 'steel', 'tobacco', 'power', 'thermal', 'adani']
|
| 267 |
+
is_high_risk = any(ind in company_name.lower() for ind in high_risk_industries)
|
| 268 |
+
|
| 269 |
+
pass_safe_harbor = False
|
| 270 |
+
if concrete_ratio > 0.05 and len(contradictions) < 2:
|
| 271 |
+
if is_high_risk:
|
| 272 |
+
if concrete_ratio > 0.20 and emission_sentiment['label'] == 'Positive':
|
| 273 |
+
pass_safe_harbor = True
|
| 274 |
+
else:
|
| 275 |
+
if risk_level_code < 2:
|
| 276 |
+
risk_level_code = 2
|
| 277 |
+
risk_reasons.append("High Risk Industry without exceptional mitigation")
|
| 278 |
+
elif emission_sentiment['label'] != 'Negative':
|
| 279 |
+
pass_safe_harbor = True
|
| 280 |
+
|
| 281 |
+
if pass_safe_harbor:
|
| 282 |
+
risk_level_code = 0 # Force No Risk
|
| 283 |
+
if risk_reasons:
|
| 284 |
+
risk_reasons = [f"Risk Mitigated: Sufficient concrete data ({round(concrete_ratio*100, 1)}%) provided."]
|
| 285 |
+
print(f"SAFE HARBOR TRIGGERED for {company_name}")
|
| 286 |
+
|
| 287 |
+
# Map code to string
|
| 288 |
+
# IMPACT: User requested specific labels
|
| 289 |
+
if risk_level_code == 2:
|
| 290 |
+
overall_risk_str = "Greenwashing"
|
| 291 |
+
greenwashing_flag = 1
|
| 292 |
+
elif risk_level_code == 1:
|
| 293 |
+
overall_risk_str = "At Risk"
|
| 294 |
+
greenwashing_flag = 0 # It's not "Greenwashing" per se, just risky? Or should flag be 1?
|
| 295 |
+
# Typically "Greenwashing" flag is binary for UI warnings, but let's keep it 1 only for High.
|
| 296 |
+
else:
|
| 297 |
+
overall_risk_str = "No Risk"
|
| 298 |
+
greenwashing_flag = 0
|
| 299 |
+
|
| 300 |
+
# Update reasons into result
|
| 301 |
+
if risk_reasons and risk_level_code >= 1:
|
| 302 |
+
pdf_scores['env_sentences'] = [f"[RISK] {r}" for r in risk_reasons] + pdf_scores['env_sentences']
|
| 303 |
+
|
| 304 |
+
# --- AI RECOMMENDATIONS & DESCRIPTION GENERATION ---
|
| 305 |
+
company_description = ""
|
| 306 |
+
ai_recommendations = {}
|
| 307 |
+
|
| 308 |
+
if pplx_data:
|
| 309 |
+
report_progress("Using insights...", 95)
|
| 310 |
+
company_description = pplx_data.get("description", "Description unavailable.")
|
| 311 |
+
ai_recommendations = pplx_data.get("recommendations", {})
|
| 312 |
+
else:
|
| 313 |
+
# Fallback to Gemini or defaults
|
| 314 |
+
try:
|
| 315 |
+
from .llm_generator import generate_company_description, generate_ai_recommendations
|
| 316 |
+
report_progress("Generating insights...", 98)
|
| 317 |
+
company_description = generate_company_description(company_name)
|
| 318 |
+
|
| 319 |
+
pre_result = {
|
| 320 |
+
"greenwashingLabel": greenwashing_flag,
|
| 321 |
+
"internal_documents_analysis": {"major_findings": pdf_scores['env_sentences'][:1]},
|
| 322 |
+
"contradictions_detected": contradictions,
|
| 323 |
+
"external_summary": {"public_sentiment": news_sentiment['label']}
|
| 324 |
+
}
|
| 325 |
+
ai_recommendations = generate_ai_recommendations(company_name, pre_result)
|
| 326 |
+
except Exception as e:
|
| 327 |
+
print(f"AI Generation fallback failed: {e}")
|
| 328 |
+
company_description = f"Analysis of {company_name}'s sustainability practices."
|
| 329 |
+
ai_recommendations = {
|
| 330 |
+
"customers": ["Review sustainability claims"],
|
| 331 |
+
"investors": ["Monitor ESG disclosures"],
|
| 332 |
+
"regulators": ["Standard compliance checks"]
|
| 333 |
+
}
|
| 334 |
+
|
| 335 |
+
# --- COMPOSITE SENTIMENT SCORE ---
|
| 336 |
+
# (calculation remains same)
|
| 337 |
+
internal_sentiment = analyze_sentiment(pdf_scores['env_sentences'])
|
| 338 |
+
|
| 339 |
+
def get_linear_score(s_dict):
|
| 340 |
+
if s_dict['label'] == 'Positive': return 50 + (s_dict['score'] * 50)
|
| 341 |
+
if s_dict['label'] == 'Negative': return 50 - (s_dict['score'] * 50)
|
| 342 |
+
return 50 # Neutral
|
| 343 |
+
|
| 344 |
+
int_s = get_linear_score(internal_sentiment)
|
| 345 |
+
ext_s = get_linear_score(news_sentiment)
|
| 346 |
+
rev_s = get_linear_score(reviews_sentiment)
|
| 347 |
+
|
| 348 |
+
composite_score = (int_s * 0.4) + (ext_s * 0.4) + (rev_s * 0.2)
|
| 349 |
+
composite_score_norm = composite_score / 100.0
|
| 350 |
+
|
| 351 |
+
# (AI generation already done above - using company_description and ai_recommendations)
|
| 352 |
+
|
| 353 |
+
# Update result
|
| 354 |
+
result = {
|
| 355 |
+
"company_name": company_name,
|
| 356 |
+
"company_description": company_description,
|
| 357 |
+
"last_updated": datetime.now().isoformat(),
|
| 358 |
+
"confidence_score": f"High ({len(news_articles) + len(reviews)} sources analyzed)",
|
| 359 |
+
"greenwashingLabel": greenwashing_flag, # 1 if Greenwashing, else 0 (Simplification for some binary UIs)
|
| 360 |
+
|
| 361 |
+
"detailed_scores": {
|
| 362 |
+
"green_keyword_frequency": round(green_keyword_freq, 3),
|
| 363 |
+
"vague_keyword_ratio": round(vague_ratio, 3),
|
| 364 |
+
"concrete_claim_ratio": round(concrete_ratio, 3),
|
| 365 |
+
"overall_sentiment": round(composite_score_norm, 3),
|
| 366 |
+
"internal_sentiment": round(internal_sentiment['score'], 3),
|
| 367 |
+
"external_sentiment": round(news_sentiment['score'], 3),
|
| 368 |
+
"external_sentiment_gap": round(ext_gap, 3),
|
| 369 |
+
"emission_sentiment": round(emission_sentiment['score'], 3),
|
| 370 |
+
"energy_sentiment": round(energy_sentiment['score'], 3),
|
| 371 |
+
"waste_sentiment": round(waste_sentiment['score'], 3),
|
| 372 |
+
"relative_focus_score": round(pdf_scores['env_count'] / max(len(pdf_sentences), 1), 3)
|
| 373 |
+
},
|
| 374 |
+
|
| 375 |
+
"external_summary": {
|
| 376 |
+
"key_highlights": [
|
| 377 |
+
f"Public Sentiment: {news_sentiment['label']}",
|
| 378 |
+
f"Risk Level: {overall_risk_str}"
|
| 379 |
+
],
|
| 380 |
+
# ...
|
| 381 |
+
"public_sentiment": news_sentiment['label'],
|
| 382 |
+
"recent_news_summary": f"Analysis of {len(news_articles)} articles.",
|
| 383 |
+
"possible_bias": "None",
|
| 384 |
+
"evidence_links": news_articles[:5]
|
| 385 |
+
},
|
| 386 |
+
|
| 387 |
+
"internal_documents_analysis": {
|
| 388 |
+
"major_findings": pdf_scores['env_sentences'][:5],
|
| 389 |
+
"compliance_risks": [f"Potential risk: {s[:50]}..." for s in pdf_scores['env_sentences'] if "aims to" in s][:3],
|
| 390 |
+
"performance_indicators": [s for s in pdf_scores['action_sentences'] if "%" in s][:5]
|
| 391 |
+
},
|
| 392 |
+
|
| 393 |
+
"risk_assessment": {
|
| 394 |
+
"financial_risk": "High" if risk_level_code == 2 else "Low",
|
| 395 |
+
"reputation_risk": "Critical" if risk_level_code == 2 else ("Medium" if risk_level_code == 1 else "Low"),
|
| 396 |
+
"compliance_risk": "High" if risk_level_code == 2 else "Low",
|
| 397 |
+
"market_risk": "Medium" if final_score < 50 else "Low",
|
| 398 |
+
# IMPACT: 3-State Output
|
| 399 |
+
"overall_risk_level": overall_risk_str
|
| 400 |
+
},
|
| 401 |
+
|
| 402 |
+
# ... (rest same) ...
|
| 403 |
+
"opportunities_and_strengths": [
|
| 404 |
+
"Expand concrete data reporting",
|
| 405 |
+
"Address external contradictions explicitly"
|
| 406 |
+
] if risk_level_code >= 1 else [
|
| 407 |
+
"Strong concrete data transparency",
|
| 408 |
+
"Positive external sentiment alignment"
|
| 409 |
+
],
|
| 410 |
+
|
| 411 |
+
"reviews_analysis": {
|
| 412 |
+
"sentiment_score": reviews_sentiment['score'],
|
| 413 |
+
"total_reviews_analyzed": len(reviews),
|
| 414 |
+
"review_sources": reviews[:5]
|
| 415 |
+
},
|
| 416 |
+
|
| 417 |
+
"recommended_actions": ai_recommendations,
|
| 418 |
+
|
| 419 |
+
"hidden_patterns": [
|
| 420 |
+
{"pattern": "Vague Language", "description": "High usage of 'aims to' without dates"}
|
| 421 |
+
] if vague_ratio > 0.4 else []
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
report_progress(f"Analysis complete: Score {final_score}/100", 100)
|
| 425 |
+
return result
|
app/services/hugchat_client.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from hugchat import hugchat
|
| 3 |
+
from hugchat.login import Login
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
HF_EMAIL = os.getenv("HUGGINGFACE_EMAIL")
|
| 9 |
+
HF_PASS = os.getenv("HUGGINGFACE_PASS")
|
| 10 |
+
|
| 11 |
+
# Global variables to reuse login session
|
| 12 |
+
_chatbot = None
|
| 13 |
+
|
| 14 |
+
def get_chatbot():
|
| 15 |
+
global _chatbot
|
| 16 |
+
if _chatbot:
|
| 17 |
+
return _chatbot
|
| 18 |
+
|
| 19 |
+
if not HF_EMAIL or not HF_PASS:
|
| 20 |
+
print("Warning: HUGGINGFACE_EMAIL or HUGGINGFACE_PASS not found.")
|
| 21 |
+
return None
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
sign = Login(HF_EMAIL, HF_PASS)
|
| 25 |
+
cookies = sign.login()
|
| 26 |
+
_chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
|
| 27 |
+
return _chatbot
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"HuggingChat Login Error: {e}")
|
| 30 |
+
return None
|
| 31 |
+
|
| 32 |
+
def generate_hugchat_response(prompt: str) -> str:
|
| 33 |
+
"""
|
| 34 |
+
Generates text using HuggingChat.
|
| 35 |
+
"""
|
| 36 |
+
chatbot = get_chatbot()
|
| 37 |
+
if not chatbot:
|
| 38 |
+
return "AI unavailable (Auth missing)."
|
| 39 |
+
|
| 40 |
+
try:
|
| 41 |
+
# Create a new conversation for isolation or reuse default
|
| 42 |
+
id = chatbot.new_conversation()
|
| 43 |
+
chatbot.change_conversation(id)
|
| 44 |
+
|
| 45 |
+
response = chatbot.chat(prompt)
|
| 46 |
+
text = response.wait_until_done()
|
| 47 |
+
|
| 48 |
+
# Cleanup? (Optional, but good for privacy)
|
| 49 |
+
# chatbot.delete_conversation(id)
|
| 50 |
+
|
| 51 |
+
return text
|
| 52 |
+
except Exception as e:
|
| 53 |
+
print(f"HuggingChat Error: {e}")
|
| 54 |
+
return "AI unavailable (Error)."
|
app/services/llm_generator.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import google.generativeai as genai
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 8 |
+
|
| 9 |
+
# if GEMINI_API_KEY:
|
| 10 |
+
# genai.configure(api_key=GEMINI_API_KEY)
|
| 11 |
+
# model = genai.GenerativeModel('gemini-2.0-flash')
|
| 12 |
+
# else:
|
| 13 |
+
model = None
|
| 14 |
+
print("Gemini LLM Disabled by user request.")
|
| 15 |
+
# print("Warning: GEMINI_API_KEY not found in .env. LLM features will be disabled.")
|
| 16 |
+
|
| 17 |
+
def generate_company_description(company_name: str) -> str:
|
| 18 |
+
"""
|
| 19 |
+
Generates a brief 2-3 sentence description of the company using Gemini.
|
| 20 |
+
"""
|
| 21 |
+
if not model:
|
| 22 |
+
return "AI description unavailable (API Key missing)."
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
prompt = f"Provide a factual, neutral 2-3 sentence description of the company '{company_name}', focusing on its industry and main products. Do not mention sentiment or controversies."
|
| 26 |
+
response = model.generate_content(prompt)
|
| 27 |
+
return response.text.strip()
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"Error generating description for {company_name}: {e}")
|
| 30 |
+
return "AI description unavailable due to an error."
|
| 31 |
+
|
| 32 |
+
def generate_ai_recommendations(company_name: str, analysis_data: dict) -> dict:
|
| 33 |
+
"""
|
| 34 |
+
Generates tailored recommendations for Customers, Investors, and Leadership based on the analysis.
|
| 35 |
+
"""
|
| 36 |
+
if not model:
|
| 37 |
+
return {
|
| 38 |
+
"for_customers": ["Review provided evidence links."],
|
| 39 |
+
"for_investors": ["Analyze financial risks mentioned in report."],
|
| 40 |
+
"for_company_leadership": ["Address flagged contradictions."]
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
# Construct a summary context for the LLM
|
| 45 |
+
context = f"""
|
| 46 |
+
Company: {company_name}
|
| 47 |
+
Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
|
| 48 |
+
Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
|
| 49 |
+
Contradictions: {len(analysis_data.get('contradictions_detected', []))} found.
|
| 50 |
+
Sentiment: {analysis_data.get('external_summary', {}).get('public_sentiment', 'N/A')}
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
prompt = f"""
|
| 54 |
+
Based on the following analysis of '{company_name}', provide 3 specific, actionable recommendations for each group (Customers, Investors, Leadership).
|
| 55 |
+
Focus on greenwashing, transparency, and sustainability accountability.
|
| 56 |
+
|
| 57 |
+
Analysis Context:
|
| 58 |
+
{context}
|
| 59 |
+
|
| 60 |
+
Output purely as JSON format with keys: "for_customers", "for_investors", "for_company_leadership". Each key should have a list of strings.
|
| 61 |
+
Do not allow Markdown code blocks. Just raw JSON.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
response = model.generate_content(prompt)
|
| 65 |
+
text = response.text.strip()
|
| 66 |
+
|
| 67 |
+
# Clean potential markdown wrapping
|
| 68 |
+
if text.startswith("```json"):
|
| 69 |
+
text = text[7:]
|
| 70 |
+
if text.endswith("```"):
|
| 71 |
+
text = text[:-3]
|
| 72 |
+
|
| 73 |
+
import json
|
| 74 |
+
return json.loads(text)
|
| 75 |
+
|
| 76 |
+
except Exception as e:
|
| 77 |
+
print(f"Error generating recommendations for {company_name}: {e}")
|
| 78 |
+
# Fallback
|
| 79 |
+
return {
|
| 80 |
+
"for_customers": ["Review provided evidence links.", "Cross-check claims."],
|
| 81 |
+
"for_investors": ["Monitor reputational risks.", "Demand clearer impact reports."],
|
| 82 |
+
"for_company_leadership": ["Address detected contradictions.", "Improve transparency."]
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
def generate_combined_insights(company_name: str, analysis_data: dict) -> dict:
|
| 86 |
+
"""
|
| 87 |
+
Combines description and recommendations into a single API call to reduce rate limit usage.
|
| 88 |
+
Returns: { "description": str, "recommendations": dict }
|
| 89 |
+
"""
|
| 90 |
+
if not model:
|
| 91 |
+
return {
|
| 92 |
+
"description": "AI description unavailable (API Key missing).",
|
| 93 |
+
"recommendations": generate_ai_recommendations(company_name, analysis_data) # Fallback to default
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
try:
|
| 97 |
+
context = f"""
|
| 98 |
+
Company: {company_name}
|
| 99 |
+
Greenwashing Risk: {'High' if analysis_data.get('greenwashingLabel') == 1 else 'Low'}
|
| 100 |
+
Reason: {analysis_data.get('internal_documents_analysis', {}).get('major_findings', ['N/A'])[0]}
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
prompt = f"""
|
| 104 |
+
Analyze '{company_name}' based on this context:
|
| 105 |
+
{context}
|
| 106 |
+
|
| 107 |
+
Provide 2 outputs in a single JSON object:
|
| 108 |
+
1. "description": A factual 2-sentence description of the company.
|
| 109 |
+
2. "recommendations": A dictionary with keys "for_customers", "for_investors", "for_company_leadership", containing 3 actionable tips for each.
|
| 110 |
+
|
| 111 |
+
Output purely JSON. No markdown.
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
response = model.generate_content(prompt)
|
| 115 |
+
text = response.text.strip()
|
| 116 |
+
if text.startswith("```json"): text = text[7:]
|
| 117 |
+
if text.endswith("```"): text = text[:-3]
|
| 118 |
+
|
| 119 |
+
return json.loads(text)
|
| 120 |
+
except Exception as e:
|
| 121 |
+
print(f"Error generating combined insights for {company_name}: {e}")
|
| 122 |
+
return {
|
| 123 |
+
"description": "AI description unavailable due to high traffic.",
|
| 124 |
+
"recommendations": {
|
| 125 |
+
"for_customers": ["Review evidence links."],
|
| 126 |
+
"for_investors": ["Analyze risks."],
|
| 127 |
+
"for_company_leadership": ["Address contradictions."]
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
def generate_batch_insights(companies_data: list) -> dict:
|
| 132 |
+
"""
|
| 133 |
+
Generates insights for a batch of companies (up to 10-15 recommended) in a SINGLE prompt.
|
| 134 |
+
Input: list of {name, context: str}
|
| 135 |
+
Output: dict { company_name: { "description": ..., "recommendations": ... } }
|
| 136 |
+
"""
|
| 137 |
+
import json
|
| 138 |
+
from .hugchat_client import generate_hugchat_response
|
| 139 |
+
|
| 140 |
+
# Try HuggingChat if Gemini is disabled
|
| 141 |
+
if not model:
|
| 142 |
+
# Construct Prompt for HuggingChat
|
| 143 |
+
batch_context = ""
|
| 144 |
+
for i, c in enumerate(companies_data):
|
| 145 |
+
batch_context += f"\n--- Company {i+1}: {c['name']} ---\n{c['context']}\n"
|
| 146 |
+
|
| 147 |
+
prompt = f"""
|
| 148 |
+
You are a sustainability analyst. Analyze these {len(companies_data)} companies.
|
| 149 |
+
{batch_context}
|
| 150 |
+
|
| 151 |
+
Return a valid JSON OBJECT where keys are company names.
|
| 152 |
+
For each company, provide:
|
| 153 |
+
1. "description": A factual 2-sentence summary.
|
| 154 |
+
2. "recommendations": Object with keys "for_customers", "for_investors", "for_company_leadership" (list of 3 tips each).
|
| 155 |
+
|
| 156 |
+
Example JSON Structure:
|
| 157 |
+
{{
|
| 158 |
+
"Company Name": {{
|
| 159 |
+
"description": "...",
|
| 160 |
+
"recommendations": {{ "for_customers": [...], ... }}
|
| 161 |
+
}}
|
| 162 |
+
}}
|
| 163 |
+
|
| 164 |
+
IMPORTANT: Output ONLY valid JSON. No Markdown. No Intro.
|
| 165 |
+
"""
|
| 166 |
+
|
| 167 |
+
print("Using HuggingChat for Batch Analysis...")
|
| 168 |
+
response_text = generate_hugchat_response(prompt)
|
| 169 |
+
|
| 170 |
+
try:
|
| 171 |
+
# clean json
|
| 172 |
+
text = response_text.strip()
|
| 173 |
+
if text.startswith("```json"): text = text[7:]
|
| 174 |
+
if text.endswith("```"): text = text[:-3]
|
| 175 |
+
if "{" not in text: raise Exception("Invalid JSON format")
|
| 176 |
+
|
| 177 |
+
return json.loads(text)
|
| 178 |
+
except Exception as e:
|
| 179 |
+
print(f"HuggingChat Parsing Error: {e}")
|
| 180 |
+
# Fallthrogh to fallback
|
| 181 |
+
pass
|
| 182 |
+
|
| 183 |
+
if not model and not 'response_text' in locals():
|
| 184 |
+
# Return fallback for all
|
| 185 |
+
return {c['name']: {
|
| 186 |
+
"description": "AI unavailable (Key missing)",
|
| 187 |
+
"recommendations": {
|
| 188 |
+
"for_customers": ["Review evidence."],
|
| 189 |
+
"for_investors": ["Check risks."],
|
| 190 |
+
"for_company_leadership": ["Monitor compliance."]
|
| 191 |
+
}
|
| 192 |
+
} for c in companies_data}
|
| 193 |
+
|
| 194 |
+
try:
|
| 195 |
+
# ... (Gemini Logic remains as backup if re-enabled) ...
|
| 196 |
+
# Construct simplified context list
|
| 197 |
+
batch_context = ""
|
| 198 |
+
for i, c in enumerate(companies_data):
|
| 199 |
+
batch_context += f"\n--- Company {i+1}: {c['name']} ---\n{c['context']}\n"
|
| 200 |
+
|
| 201 |
+
prompt = f"""
|
| 202 |
+
Analyze the following {len(companies_data)} companies based on the provided contexts.
|
| 203 |
+
{batch_context}
|
| 204 |
+
|
| 205 |
+
For EACH company, provide:
|
| 206 |
+
1. "description": A factual 2-sentence summary.
|
| 207 |
+
2. "recommendations": 3 specific actionable tips per group (Customers, Investors, Leadership).
|
| 208 |
+
|
| 209 |
+
Output purely as a JSON OBJECT where keys are the exact company names and values are the insight objects.
|
| 210 |
+
Example:
|
| 211 |
+
{{
|
| 212 |
+
"Company A": {{ "description": "...", "recommendations": {{ ... }} }},
|
| 213 |
+
"Company B": ...
|
| 214 |
+
}}
|
| 215 |
+
|
| 216 |
+
No markdown formatting. Just JSON.
|
| 217 |
+
"""
|
| 218 |
+
|
| 219 |
+
response = model.generate_content(prompt)
|
| 220 |
+
text = response.text.strip()
|
| 221 |
+
if text.startswith("```json"): text = text[7:]
|
| 222 |
+
if text.endswith("```"): text = text[:-3]
|
| 223 |
+
|
| 224 |
+
results = json.loads(text)
|
| 225 |
+
return results
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"Batch generation error: {e}")
|
| 229 |
+
return {}
|
app/services/ml_logic.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .ml_models import ml_models
|
| 2 |
+
from .scoring import calculate_vague_score, calculate_concrete_score, analyze_sentiment
|
| 3 |
+
import re
|
| 4 |
+
import joblib
|
| 5 |
+
import os
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
# Path configurations
|
| 10 |
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 11 |
+
MODEL_DIR = os.path.join(BASE_DIR, "ml_models")
|
| 12 |
+
|
| 13 |
+
ENSEMBLE_PATH = os.path.join(MODEL_DIR, 'ensemble_model.pkl')
|
| 14 |
+
FEATURE_COLS_PATH = os.path.join(MODEL_DIR, 'all_feature_columns.pkl')
|
| 15 |
+
CAT_MAPPING_PATH = os.path.join(MODEL_DIR, 'category_to_greenwashing_mapping.pkl')
|
| 16 |
+
BINARY_MAPPING_PATH = os.path.join(MODEL_DIR, 'binary_to_report_name_mapping.pkl')
|
| 17 |
+
|
| 18 |
+
_ensemble_model = None
|
| 19 |
+
_feature_cols = None
|
| 20 |
+
_binary_mapping = None
|
| 21 |
+
|
| 22 |
+
def load_artifacts():
|
| 23 |
+
global _ensemble_model, _feature_cols, _binary_mapping
|
| 24 |
+
|
| 25 |
+
if _ensemble_model and _feature_cols:
|
| 26 |
+
return _ensemble_model, _feature_cols, _binary_mapping
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
if os.path.exists(ENSEMBLE_PATH):
|
| 30 |
+
print(f"[ML] Loading Ensemble Model from {ENSEMBLE_PATH}...")
|
| 31 |
+
_ensemble_model = joblib.load(ENSEMBLE_PATH)
|
| 32 |
+
_feature_cols = joblib.load(FEATURE_COLS_PATH)
|
| 33 |
+
|
| 34 |
+
if os.path.exists(BINARY_MAPPING_PATH):
|
| 35 |
+
_binary_mapping = joblib.load(BINARY_MAPPING_PATH)
|
| 36 |
+
else:
|
| 37 |
+
# Fallback mapping if file missing
|
| 38 |
+
_binary_mapping = {0: 'Not Greenwashing (Low)', 1: 'Greenwashing (High/Medium)'}
|
| 39 |
+
|
| 40 |
+
print(f"[ML] Ensemble Model Loaded. Features: {_feature_cols}")
|
| 41 |
+
return _ensemble_model, _feature_cols, _binary_mapping
|
| 42 |
+
except Exception as e:
|
| 43 |
+
print(f"[ML] Failed to load artifacts: {e}")
|
| 44 |
+
|
| 45 |
+
return None, None, None
|
| 46 |
+
|
| 47 |
+
def train_model(data: list[dict]):
|
| 48 |
+
"""
|
| 49 |
+
Legacy training function kept for compatibility but effectively disabled
|
| 50 |
+
as we are now using the pre-trained Ensemble Model.
|
| 51 |
+
"""
|
| 52 |
+
print("[ML] Train requested, but system is now using pre-trained Ensemble Model.")
|
| 53 |
+
return 0.0
|
| 54 |
+
|
| 55 |
+
def predict_greenwashing_risk(text, company_name="Unknown", features_dict=None):
|
| 56 |
+
"""
|
| 57 |
+
Predict greenwashing risk using Ensemble Model if features are provided.
|
| 58 |
+
Fallback to heuristic if only text is available.
|
| 59 |
+
"""
|
| 60 |
+
model, features, binary_map = load_artifacts()
|
| 61 |
+
|
| 62 |
+
# 1. Prediction using Ensemble Model (Feature-based)
|
| 63 |
+
if model and features and features_dict:
|
| 64 |
+
try:
|
| 65 |
+
# Prepare input dataframe with correct column order
|
| 66 |
+
input_data = {}
|
| 67 |
+
for col in features:
|
| 68 |
+
# Handle typo in specific user column "frequecy"
|
| 69 |
+
val = features_dict.get(col)
|
| 70 |
+
if val is None:
|
| 71 |
+
# Fallback for known variations
|
| 72 |
+
if col == 'Green Keyword frequecy':
|
| 73 |
+
val = features_dict.get('Green Keyword Frequency', 0)
|
| 74 |
+
elif col == 'Emission Sentiment ': # Note space
|
| 75 |
+
val = features_dict.get('Emission Sentiment', 0)
|
| 76 |
+
else:
|
| 77 |
+
val = 0
|
| 78 |
+
input_data[col] = [float(val)]
|
| 79 |
+
|
| 80 |
+
df = pd.DataFrame(input_data)
|
| 81 |
+
|
| 82 |
+
# Predict
|
| 83 |
+
pred_binary = model.predict(df)[0]
|
| 84 |
+
pred_proba = model.predict_proba(df)[0] # [prob_0, prob_1]
|
| 85 |
+
prob_gw = pred_proba[1]
|
| 86 |
+
|
| 87 |
+
# granular mapping based on probability
|
| 88 |
+
if prob_gw >= 0.75:
|
| 89 |
+
risk_label = "High"
|
| 90 |
+
label_text = "High Risk"
|
| 91 |
+
elif prob_gw >= 0.35:
|
| 92 |
+
risk_label = "Medium"
|
| 93 |
+
label_text = "Medium Risk"
|
| 94 |
+
else:
|
| 95 |
+
risk_label = "Low"
|
| 96 |
+
label_text = "Low Risk"
|
| 97 |
+
|
| 98 |
+
return {
|
| 99 |
+
"company_name": company_name,
|
| 100 |
+
"greenwashing_score": round(prob_gw, 3),
|
| 101 |
+
"risk_label": risk_label,
|
| 102 |
+
"model_label": risk_label, # Use simple label for UI mapping
|
| 103 |
+
"details": {
|
| 104 |
+
"model_used": "Ensemble Voting Classifier",
|
| 105 |
+
"confidence": round(max(pred_proba) * 100, 1),
|
| 106 |
+
"features": features_dict # Return original features for UI
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
except Exception as e:
|
| 111 |
+
print(f"[ML] Ensemble prediction failed: {e}")
|
| 112 |
+
# Fallback to heuristic below
|
| 113 |
+
|
| 114 |
+
# 2. Heuristic Fallback (Text-based)
|
| 115 |
+
sentences = re.split(r'(?<=[.!?]) +', text)
|
| 116 |
+
vague_score = calculate_vague_score(sentences)
|
| 117 |
+
concrete_score = calculate_concrete_score(sentences)
|
| 118 |
+
sentiment = analyze_sentiment([text])
|
| 119 |
+
|
| 120 |
+
risk_score = 0.5 + (vague_score * 0.4) - (concrete_score * 0.5)
|
| 121 |
+
if sentiment['label'] == 'Negative':
|
| 122 |
+
risk_score += sentiment['score'] * 0.2
|
| 123 |
+
risk_score = max(0, min(1, risk_score))
|
| 124 |
+
|
| 125 |
+
return {
|
| 126 |
+
"company_name": company_name,
|
| 127 |
+
"greenwashing_score": round(risk_score, 3),
|
| 128 |
+
"risk_label": "High Risk" if risk_score > 0.7 else "Low Risk",
|
| 129 |
+
"model_label": "Heuristic Analysis",
|
| 130 |
+
"details": {
|
| 131 |
+
"vague_language_ratio": round(vague_score, 3),
|
| 132 |
+
"concrete_claims_ratio": round(concrete_score, 3),
|
| 133 |
+
"model_used": "Heuristic Fallback"
|
| 134 |
+
}
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
|
app/services/ml_models.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
class MLModels:
|
| 6 |
+
_instance = None
|
| 7 |
+
|
| 8 |
+
def __new__(cls):
|
| 9 |
+
if cls._instance is None:
|
| 10 |
+
cls._instance = super(MLModels, cls).__new__(cls)
|
| 11 |
+
cls._instance.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 12 |
+
print(f"Loading models on {cls._instance.device}...")
|
| 13 |
+
|
| 14 |
+
# Load Sentence Transformer
|
| 15 |
+
cls._instance.st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=cls._instance.device)
|
| 16 |
+
|
| 17 |
+
# Load FinBERT for sentiment
|
| 18 |
+
cls._instance.finbert = pipeline("text-classification", model="yiyanghkust/finbert-tone", device=0 if cls._instance.device == 'cuda' else -1)
|
| 19 |
+
|
| 20 |
+
# Load ClimateBERT for ESG sentiment (optional, can be heavy)
|
| 21 |
+
# cls._instance.climatebert = pipeline("text-classification", model="climatebert/distilroberta-base-climate-sentiment", device=0 if cls._instance.device == 'cuda' else -1)
|
| 22 |
+
|
| 23 |
+
print("Models loaded successfully.")
|
| 24 |
+
return cls._instance
|
| 25 |
+
|
| 26 |
+
ml_models = MLModels()
|
app/services/pdf_processor.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
def extract_text_from_pdf(pdf_path: str) -> str:
|
| 5 |
+
text = ""
|
| 6 |
+
try:
|
| 7 |
+
with fitz.open(pdf_path) as doc:
|
| 8 |
+
for page in doc:
|
| 9 |
+
text += page.get_text()
|
| 10 |
+
except Exception as e:
|
| 11 |
+
print(f"Error reading PDF {pdf_path}: {e}")
|
| 12 |
+
return ""
|
| 13 |
+
return text
|
| 14 |
+
|
| 15 |
+
def split_sentences(text: str) -> list[str]:
|
| 16 |
+
# Simple regex split on punctuation followed by space
|
| 17 |
+
return re.split(r'(?<=[.!?])\s+', text)
|
| 18 |
+
|
| 19 |
+
def clean_text(text: str) -> str:
|
| 20 |
+
text = re.sub(r"\s+", " ", str(text)).strip()
|
| 21 |
+
return text
|
app/services/perplexity_client.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
|
| 9 |
+
|
| 10 |
+
def research_company(company_name: str) -> dict:
|
| 11 |
+
"""
|
| 12 |
+
Uses Perplexity AI to conduct deep web research on a company's environmental impact.
|
| 13 |
+
Returns: { "description": str, "findings": list, "sentiment": str, "citations": list }
|
| 14 |
+
"""
|
| 15 |
+
if not PERPLEXITY_API_KEY:
|
| 16 |
+
print("Warning: PERPLEXITY_API_KEY not found.")
|
| 17 |
+
return None
|
| 18 |
+
|
| 19 |
+
url = "https://api.perplexity.ai/chat/completions"
|
| 20 |
+
|
| 21 |
+
# Prompt designed to extract structured data compatible with our existing analysis
|
| 22 |
+
system_prompt = "You are an environmental analyst. Research the target company and return a JSON object with: 'description' (factual summaries), 'findings' (list of 5 key controversies or achievements), 'sentiment' (Positive/Negative/Mixed), 'citations' (list of source URLs), and 'recommendations' (object with keys 'for_customers', 'for_investors', 'for_company_leadership', each a list of 3 strings)."
|
| 23 |
+
|
| 24 |
+
user_prompt = f"Research the environmental track record of '{company_name}'. Focus on emissions, greenwashing, and sustainability 2023-2025."
|
| 25 |
+
|
| 26 |
+
payload = {
|
| 27 |
+
"model": "sonar",
|
| 28 |
+
"messages": [
|
| 29 |
+
{"role": "system", "content": system_prompt},
|
| 30 |
+
{"role": "user", "content": user_prompt}
|
| 31 |
+
],
|
| 32 |
+
"temperature": 0.2
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
headers = {
|
| 36 |
+
"Authorization": f"Bearer {PERPLEXITY_API_KEY}",
|
| 37 |
+
"Content-Type": "application/json"
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
try:
|
| 41 |
+
response = requests.post(url, json=payload, headers=headers)
|
| 42 |
+
response.raise_for_status()
|
| 43 |
+
|
| 44 |
+
result = response.json()
|
| 45 |
+
content = result['choices'][0]['message']['content']
|
| 46 |
+
citations = result.get('citations', [])
|
| 47 |
+
|
| 48 |
+
# Clean JSON markdown if present
|
| 49 |
+
if content.startswith("```json"): content = content[7:]
|
| 50 |
+
if content.endswith("```"): content = content[:-3]
|
| 51 |
+
|
| 52 |
+
data = json.loads(content)
|
| 53 |
+
data['citations'] = citations # Ensure citations are attached
|
| 54 |
+
return data
|
| 55 |
+
|
| 56 |
+
except Exception as e:
|
| 57 |
+
print(f"Perplexity API Error: {e}")
|
| 58 |
+
return None
|
app/services/scoring.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from sentence_transformers import util
|
| 3 |
+
from .ml_models import ml_models
|
| 4 |
+
|
| 5 |
+
# Reference phrases
|
| 6 |
+
ENV_REF = [
|
| 7 |
+
"environment", "climate change", "carbon emissions", "pollution", "waste",
|
| 8 |
+
"green energy", "renewable resources", "sustainability", "biodiversity",
|
| 9 |
+
"eco-friendly", "net zero", "solar energy", "wind energy", "water conservation"
|
| 10 |
+
]
|
| 11 |
+
ESG_REF = [
|
| 12 |
+
"environment", "social responsibility", "governance", "sustainability", "carbon emissions",
|
| 13 |
+
"green energy", "renewable resources", "waste management", "climate change", "pollution control",
|
| 14 |
+
"biodiversity", "eco-friendly", "net zero", "solar energy", "wind energy", "water conservation",
|
| 15 |
+
"community development", "employee welfare", "diversity", "ethics"
|
| 16 |
+
]
|
| 17 |
+
ACTION_REF = [
|
| 18 |
+
"implemented", "adopted", "reduced emissions", "recycled", "renewable energy",
|
| 19 |
+
"sustainability project", "steps taken to reduce carbon emissions",
|
| 20 |
+
"initiatives to help the environment", "measures to prevent greenwashing"
|
| 21 |
+
]
|
| 22 |
+
CLAIM_REF = [
|
| 23 |
+
"plans to achieve", "committed to", "targets", "pledges", "goal", "aims to",
|
| 24 |
+
"intent to reduce", "objective to be", "aims for sustainability",
|
| 25 |
+
"pledged to achieve", "will reduce carbon", "expect to reach net zero",
|
| 26 |
+
"plans to be carbon neutral by", "commitment to net zero by",
|
| 27 |
+
"goal to be eco friendly by", "target year for sustainability",
|
| 28 |
+
"striving to be net zero", "intends to adopt renewable energy", "aiming for eco-friendly operations"
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
def semantic_matches(sentences, reference, threshold=0.55, batch_size=64):
|
| 32 |
+
model = ml_models.st_model
|
| 33 |
+
ref_emb = model.encode(reference, convert_to_tensor=True)
|
| 34 |
+
matches = []
|
| 35 |
+
|
| 36 |
+
# Process in batches
|
| 37 |
+
for i in range(0, len(sentences), batch_size):
|
| 38 |
+
batch = sentences[i:i+batch_size]
|
| 39 |
+
if not batch: continue
|
| 40 |
+
sent_emb = model.encode(batch, convert_to_tensor=True)
|
| 41 |
+
sim_matrix = util.cos_sim(sent_emb, ref_emb)
|
| 42 |
+
|
| 43 |
+
for j, sim_scores in enumerate(sim_matrix):
|
| 44 |
+
if sim_scores.max().item() >= threshold:
|
| 45 |
+
matches.append(batch[j].strip())
|
| 46 |
+
|
| 47 |
+
return matches if matches else []
|
| 48 |
+
|
| 49 |
+
def calculate_scores(sentences):
|
| 50 |
+
env_sentences = semantic_matches(sentences, ENV_REF)
|
| 51 |
+
esg_sentences = semantic_matches(sentences, ESG_REF)
|
| 52 |
+
action_sentences = semantic_matches(sentences, ACTION_REF)
|
| 53 |
+
claim_sentences = semantic_matches(sentences, CLAIM_REF, threshold=0.54)
|
| 54 |
+
|
| 55 |
+
return {
|
| 56 |
+
"env_count": len(env_sentences),
|
| 57 |
+
"esg_count": len(esg_sentences),
|
| 58 |
+
"action_count": len(action_sentences),
|
| 59 |
+
"claim_count": len(claim_sentences),
|
| 60 |
+
"env_sentences": env_sentences,
|
| 61 |
+
"action_sentences": action_sentences
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
def calculate_vague_score(sentences):
|
| 65 |
+
"""
|
| 66 |
+
Calculate the ratio of sentences containing vague/future-tense language.
|
| 67 |
+
"""
|
| 68 |
+
vague_patterns = [
|
| 69 |
+
r"aim(s|ing)? to", r"plan(s|ning)? to", r"committed to", r"strive(s|ing)? for",
|
| 70 |
+
r"intend(s|ing)? to", r"goal of", r"vision", r"hopefully", r"aspire(s|ing)? to",
|
| 71 |
+
r"future", r"potential", r"believe"
|
| 72 |
+
]
|
| 73 |
+
regex = re.compile("|".join(vague_patterns), re.IGNORECASE)
|
| 74 |
+
|
| 75 |
+
count = 0
|
| 76 |
+
for sent in sentences:
|
| 77 |
+
if regex.search(sent):
|
| 78 |
+
count += 1
|
| 79 |
+
|
| 80 |
+
return count / max(len(sentences), 1)
|
| 81 |
+
|
| 82 |
+
def calculate_concrete_score(sentences):
|
| 83 |
+
"""
|
| 84 |
+
Calculate the ratio of sentences containing specific, concrete metrics.
|
| 85 |
+
Looking for numbers followed by %, $, tons, kg, or years.
|
| 86 |
+
"""
|
| 87 |
+
concrete_patterns = [
|
| 88 |
+
r"\d+(\.\d+)?%", # Percentages
|
| 89 |
+
r"\$\d+", # Money
|
| 90 |
+
r"\d+ (tons|kg|metric tons|tonnes)", # Weight
|
| 91 |
+
r"by 20\d{2}", # Years (e.g. by 2030)
|
| 92 |
+
r"reduced by", r"achieved", r"completed" # Past tense concrete verbs
|
| 93 |
+
]
|
| 94 |
+
regex = re.compile("|".join(concrete_patterns), re.IGNORECASE)
|
| 95 |
+
|
| 96 |
+
count = 0
|
| 97 |
+
for sent in sentences:
|
| 98 |
+
if regex.search(sent):
|
| 99 |
+
count += 1
|
| 100 |
+
|
| 101 |
+
return count / max(len(sentences), 1)
|
| 102 |
+
|
| 103 |
+
def analyze_sentiment(text_chunks):
|
| 104 |
+
# Use FinBERT for sentiment
|
| 105 |
+
results = []
|
| 106 |
+
for chunk in text_chunks:
|
| 107 |
+
# Truncate to 1500 chars (approx 300-400 tokens) to be safe
|
| 108 |
+
if len(chunk) > 1500: chunk = chunk[:1500]
|
| 109 |
+
try:
|
| 110 |
+
res = ml_models.finbert(chunk, truncation=True, max_length=512)
|
| 111 |
+
results.append(res[0]) # [{'label': 'Positive', 'score': 0.9}]
|
| 112 |
+
except Exception as e:
|
| 113 |
+
print(f"Sentiment error: {e}")
|
| 114 |
+
|
| 115 |
+
# Aggregate
|
| 116 |
+
if not results: return {"label": "Neutral", "score": 0.5}
|
| 117 |
+
|
| 118 |
+
pos = sum(1 for r in results if r['label'] == 'Positive')
|
| 119 |
+
neg = sum(1 for r in results if r['label'] == 'Negative')
|
| 120 |
+
neu = sum(1 for r in results if r['label'] == 'Neutral')
|
| 121 |
+
|
| 122 |
+
total = len(results)
|
| 123 |
+
if pos > neg and pos > neu: return {"label": "Positive", "score": pos/total}
|
| 124 |
+
if neg > pos and neg > neu: return {"label": "Negative", "score": neg/total}
|
| 125 |
+
return {"label": "Neutral", "score": neu/total}
|
| 126 |
+
|
| 127 |
+
def analyze_aspect_sentiment(text_chunks, aspect_keywords):
|
| 128 |
+
"""
|
| 129 |
+
Analyze sentiment only for chunks containing specific keywords
|
| 130 |
+
"""
|
| 131 |
+
aspect_chunks = []
|
| 132 |
+
for chunk in text_chunks:
|
| 133 |
+
if any(keyword in chunk.lower() for keyword in aspect_keywords):
|
| 134 |
+
aspect_chunks.append(chunk)
|
| 135 |
+
|
| 136 |
+
if not aspect_chunks:
|
| 137 |
+
return {"label": "Neutral", "score": 0.5}
|
| 138 |
+
|
| 139 |
+
return analyze_sentiment(aspect_chunks)
|
app/services/scraper.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import requests
|
| 4 |
+
import logging
|
| 5 |
+
from fake_useragent import UserAgent
|
| 6 |
+
try:
|
| 7 |
+
from ddgs import DDGS
|
| 8 |
+
except ImportError:
|
| 9 |
+
from duckduckgo_search import DDGS
|
| 10 |
+
from selenium import webdriver
|
| 11 |
+
from selenium.webdriver.chrome.options import Options
|
| 12 |
+
from selenium.webdriver.chrome.service import Service
|
| 13 |
+
from selenium_stealth import stealth
|
| 14 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 15 |
+
from bs4 import BeautifulSoup
|
| 16 |
+
|
| 17 |
+
# Configure logging
|
| 18 |
+
logging.basicConfig(level=logging.INFO)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
ua = UserAgent()
|
| 22 |
+
|
| 23 |
+
# Progress tracking
|
| 24 |
+
progress_callback = None
|
| 25 |
+
|
| 26 |
+
def set_progress_callback(callback):
|
| 27 |
+
"""Set a callback function to report progress"""
|
| 28 |
+
global progress_callback
|
| 29 |
+
progress_callback = callback
|
| 30 |
+
|
| 31 |
+
def report_progress(message, percentage):
|
| 32 |
+
"""Report progress if callback is set"""
|
| 33 |
+
if progress_callback:
|
| 34 |
+
progress_callback(message, percentage)
|
| 35 |
+
print(f"[{percentage}%] {message}")
|
| 36 |
+
|
| 37 |
+
def setup_selenium_driver():
|
| 38 |
+
"""Setup a stealth Selenium driver with HuggingFace/Docker compatibility"""
|
| 39 |
+
options = Options()
|
| 40 |
+
options.add_argument("--headless=new") # New headless mode
|
| 41 |
+
options.add_argument("--no-sandbox")
|
| 42 |
+
options.add_argument("--disable-dev-shm-usage")
|
| 43 |
+
options.add_argument("--disable-gpu")
|
| 44 |
+
options.add_argument("--disable-extensions")
|
| 45 |
+
options.add_argument("--disable-infobars")
|
| 46 |
+
options.add_argument("--window-size=1920,1080")
|
| 47 |
+
options.add_argument(f"user-agent={ua.random}")
|
| 48 |
+
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 49 |
+
options.add_experimental_option('useAutomationExtension', False)
|
| 50 |
+
|
| 51 |
+
# Check if running in Docker/HuggingFace environment
|
| 52 |
+
is_docker = os.path.exists("/.dockerenv") or os.environ.get("HF_SPACE_ID")
|
| 53 |
+
|
| 54 |
+
driver = None
|
| 55 |
+
|
| 56 |
+
if is_docker:
|
| 57 |
+
logger.info("Running in Docker/HuggingFace environment, using system Chromium")
|
| 58 |
+
# Use system Chromium in Docker
|
| 59 |
+
chromium_paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
|
| 60 |
+
chromedriver_paths = ["/usr/bin/chromedriver", "/usr/local/bin/chromedriver"]
|
| 61 |
+
|
| 62 |
+
for chromium_path in chromium_paths:
|
| 63 |
+
if os.path.exists(chromium_path):
|
| 64 |
+
options.binary_location = chromium_path
|
| 65 |
+
logger.info(f"Using Chromium at: {chromium_path}")
|
| 66 |
+
break
|
| 67 |
+
|
| 68 |
+
try:
|
| 69 |
+
# Try with system chromedriver first
|
| 70 |
+
for chromedriver_path in chromedriver_paths:
|
| 71 |
+
if os.path.exists(chromedriver_path):
|
| 72 |
+
service = Service(chromedriver_path)
|
| 73 |
+
driver = webdriver.Chrome(service=service, options=options)
|
| 74 |
+
logger.info(f"Using chromedriver at: {chromedriver_path}")
|
| 75 |
+
break
|
| 76 |
+
|
| 77 |
+
if driver is None:
|
| 78 |
+
# Fallback to webdriver_manager
|
| 79 |
+
service = Service(ChromeDriverManager().install())
|
| 80 |
+
driver = webdriver.Chrome(service=service, options=options)
|
| 81 |
+
except Exception as e:
|
| 82 |
+
logger.error(f"Docker Chrome setup failed: {e}")
|
| 83 |
+
# Final fallback - try default Chrome
|
| 84 |
+
try:
|
| 85 |
+
driver = webdriver.Chrome(options=options)
|
| 86 |
+
except Exception as e2:
|
| 87 |
+
logger.error(f"All Chrome drivers failed: {e2}")
|
| 88 |
+
raise
|
| 89 |
+
else:
|
| 90 |
+
# Local development - use webdriver_manager
|
| 91 |
+
try:
|
| 92 |
+
service = Service(ChromeDriverManager().install())
|
| 93 |
+
driver = webdriver.Chrome(service=service, options=options)
|
| 94 |
+
except Exception as e:
|
| 95 |
+
logger.error(f"Failed to initialize Chrome driver with manager: {e}")
|
| 96 |
+
driver = webdriver.Chrome(options=options)
|
| 97 |
+
|
| 98 |
+
# Apply stealth settings
|
| 99 |
+
stealth(driver,
|
| 100 |
+
languages=["en-US", "en"],
|
| 101 |
+
vendor="Google Inc.",
|
| 102 |
+
platform="Win32",
|
| 103 |
+
webgl_vendor="Intel Inc.",
|
| 104 |
+
renderer="Intel Iris OpenGL Engine",
|
| 105 |
+
fix_hairline=True,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
return driver
|
| 109 |
+
|
| 110 |
+
async def scrape_url_selenium(url):
|
| 111 |
+
"""Scrape a URL using Selenium Stealth for better evasion"""
|
| 112 |
+
logger.info(f"Scraping with Selenium: {url}")
|
| 113 |
+
try:
|
| 114 |
+
def _selenium_task():
|
| 115 |
+
driver = setup_selenium_driver()
|
| 116 |
+
try:
|
| 117 |
+
driver.get(url)
|
| 118 |
+
# Wait for some content (simple sleep for now, could be improved with WebDriverWait)
|
| 119 |
+
import time
|
| 120 |
+
time.sleep(3)
|
| 121 |
+
content = driver.page_source
|
| 122 |
+
return content
|
| 123 |
+
finally:
|
| 124 |
+
driver.quit()
|
| 125 |
+
|
| 126 |
+
content = await asyncio.to_thread(_selenium_task)
|
| 127 |
+
|
| 128 |
+
# Parse with BS4 to get clean text
|
| 129 |
+
soup = BeautifulSoup(content, 'html.parser')
|
| 130 |
+
# Remove script and style elements
|
| 131 |
+
for script in soup(["script", "style"]):
|
| 132 |
+
script.decompose()
|
| 133 |
+
text = soup.get_text(separator=' ', strip=True)
|
| 134 |
+
return text, content
|
| 135 |
+
|
| 136 |
+
except Exception as e:
|
| 137 |
+
logger.error(f"Selenium scraping failed for {url}: {e}")
|
| 138 |
+
return "", ""
|
| 139 |
+
|
| 140 |
+
async def search_web(query, max_results=5):
|
| 141 |
+
"""
|
| 142 |
+
Search the web using DuckDuckGo (no API key required)
|
| 143 |
+
"""
|
| 144 |
+
try:
|
| 145 |
+
results = []
|
| 146 |
+
# specific implementation for DuckDuckGo might need sync wrapper if library is sync-only
|
| 147 |
+
# DDGS().text() is synchronous generator
|
| 148 |
+
|
| 149 |
+
def run_search():
|
| 150 |
+
with DDGS() as ddgs:
|
| 151 |
+
return list(ddgs.text(query, max_results=max_results))
|
| 152 |
+
|
| 153 |
+
# Run sync search in thread
|
| 154 |
+
search_results = await asyncio.to_thread(run_search)
|
| 155 |
+
|
| 156 |
+
for res in search_results:
|
| 157 |
+
results.append({
|
| 158 |
+
"title": res.get('title', ''),
|
| 159 |
+
"url": res.get('href', ''),
|
| 160 |
+
"content": res.get('body', ''),
|
| 161 |
+
"query_type": "web_search"
|
| 162 |
+
})
|
| 163 |
+
|
| 164 |
+
return results
|
| 165 |
+
except Exception as e:
|
| 166 |
+
print(f"Search error for '{query}': {e}")
|
| 167 |
+
return []
|
| 168 |
+
|
| 169 |
+
async def get_news_from_api(company_name):
|
| 170 |
+
"""
|
| 171 |
+
Use NewsAPI for reliable news collection
|
| 172 |
+
"""
|
| 173 |
+
api_key = os.getenv('NEWS_API_KEY')
|
| 174 |
+
if not api_key:
|
| 175 |
+
return []
|
| 176 |
+
|
| 177 |
+
try:
|
| 178 |
+
url = f"https://newsapi.org/v2/everything"
|
| 179 |
+
params = {
|
| 180 |
+
'q': f'{company_name} AND (sustainability OR greenwashing OR ESG OR environmental)',
|
| 181 |
+
'language': 'en',
|
| 182 |
+
'sortBy': 'relevancy',
|
| 183 |
+
'pageSize': 15,
|
| 184 |
+
'apiKey': api_key
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
# Requests is blocking, so we run it in a thread to verify
|
| 188 |
+
response = await asyncio.to_thread(requests.get, url, params=params, timeout=10)
|
| 189 |
+
data = response.json()
|
| 190 |
+
|
| 191 |
+
if data.get('status') == 'ok':
|
| 192 |
+
articles = []
|
| 193 |
+
for article in data.get('articles', []):
|
| 194 |
+
# Filter out removed content
|
| 195 |
+
if article.get('title') == '[Removed]': continue
|
| 196 |
+
|
| 197 |
+
# KEYWORD FILTERS (Same as Web Search)
|
| 198 |
+
title_lower = (article.get('title') or "").lower()
|
| 199 |
+
desc_lower = (article.get('description') or "").lower()
|
| 200 |
+
text_to_check = title_lower + " " + desc_lower
|
| 201 |
+
|
| 202 |
+
# 1. NEGATIVE FILTER: Exclude crime/fraud
|
| 203 |
+
bad_keywords = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe", "punish", "litigation"]
|
| 204 |
+
if any(bad in title_lower for bad in bad_keywords):
|
| 205 |
+
continue
|
| 206 |
+
|
| 207 |
+
# 2. POSITIVE FILTER: Must have ESG context (If query logic fails)
|
| 208 |
+
# NewsAPI query already has keywords, but let's double check to be safe
|
| 209 |
+
pass # Relying on API query "AND (sustainability OR ...)" for now
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
articles.append({
|
| 213 |
+
'url': article.get('url', ''),
|
| 214 |
+
'title': article.get('title', ''),
|
| 215 |
+
'content': (article.get('description') or '') + ' ' + (article.get('content') or ''),
|
| 216 |
+
'query_type': 'news_api'
|
| 217 |
+
})
|
| 218 |
+
return articles
|
| 219 |
+
except Exception as e:
|
| 220 |
+
print(f"NewsAPI error: {e}")
|
| 221 |
+
|
| 222 |
+
return []
|
| 223 |
+
|
| 224 |
+
# Helper for Filtering
|
| 225 |
+
def is_valid_result(res):
|
| 226 |
+
"""Filter out navigational, login, and irrelevant links"""
|
| 227 |
+
url = res.get('url', '').lower()
|
| 228 |
+
title = res.get('title', '').lower()
|
| 229 |
+
content = res.get('content', '').lower()
|
| 230 |
+
|
| 231 |
+
# 1. Exclude generic Google/Navigational links
|
| 232 |
+
invalid_domains = ['google.com/search', 'google.com/url', 'accounts.google.com', 'support.google.com',
|
| 233 |
+
'youtube.com', 'facebook.com', 'twitter.com/login', 'linkedin.com/login']
|
| 234 |
+
|
| 235 |
+
# 2. Exclude actions
|
| 236 |
+
invalid_terms = ['sign in', 'log in', 'forgot password', 'download', 'captcha', 'security check', 'robot', 'access denied']
|
| 237 |
+
|
| 238 |
+
if any(d in url for d in invalid_domains): return False
|
| 239 |
+
if any(t in title for t in invalid_terms): return False
|
| 240 |
+
|
| 241 |
+
# 3. Minimum content length/quality (for reviews)
|
| 242 |
+
# if len(content) < 20: return False # Optional rule
|
| 243 |
+
|
| 244 |
+
return True
|
| 245 |
+
|
| 246 |
+
async def get_company_news(company_name):
|
| 247 |
+
"""Get news using NewsAPI and DuckDuckGo Fallback"""
|
| 248 |
+
report_progress(f"Starting news collection for {company_name}", 10)
|
| 249 |
+
|
| 250 |
+
articles = []
|
| 251 |
+
# 1. Try NewsAPI (Limit increased to 20)
|
| 252 |
+
report_progress("Checking NewsAPI...", 15)
|
| 253 |
+
api_articles = await get_news_from_api(company_name)
|
| 254 |
+
articles.extend(api_articles)
|
| 255 |
+
|
| 256 |
+
# 2. Add Web Search (DuckDuckGo) for deeper coverage
|
| 257 |
+
report_progress("Fetching additional news via Web Search...", 25)
|
| 258 |
+
|
| 259 |
+
queries = [
|
| 260 |
+
f'"{company_name}" environmental impact report news',
|
| 261 |
+
f'"{company_name}" greenwashing controversy scandal',
|
| 262 |
+
f'"{company_name}" sustainability goals criticism',
|
| 263 |
+
f'"{company_name}" ESG rating news detected',
|
| 264 |
+
f'"{company_name}" climate change commitments review'
|
| 265 |
+
]
|
| 266 |
+
|
| 267 |
+
# ESG/Climate Keywords (Refined to avoid generic matches)
|
| 268 |
+
ESG_KEYWORDS = [
|
| 269 |
+
"climate", "carbon", "emission", "pollution", "sustainability", "esg",
|
| 270 |
+
"renewable", "net zero", "biodiversity", "ecological", "greenhouse", "fossil fuel"
|
| 271 |
+
]
|
| 272 |
+
# "green" and "environment" removed as they match "green light", "business environment"
|
| 273 |
+
|
| 274 |
+
# Negative Keywords to exclude financial crime/generic news
|
| 275 |
+
NEGATIVE_KEYWORDS = ["fraud", "arrest", "scam", "police", "laundering", "jail", "cbi", "ed", "bribe"]
|
| 276 |
+
|
| 277 |
+
for query in queries:
|
| 278 |
+
if len(articles) >= 20: break
|
| 279 |
+
|
| 280 |
+
results = await search_web(query, max_results=5)
|
| 281 |
+
for res in results:
|
| 282 |
+
if not is_valid_result(res): continue
|
| 283 |
+
|
| 284 |
+
# Combine Title + Body for checking
|
| 285 |
+
text_to_check = (res.get('title', '') + " " + res.get('body', '')).lower()
|
| 286 |
+
title_lower = res.get('title', '').lower()
|
| 287 |
+
|
| 288 |
+
# 1. NEGATIVE FILTER: Exclude crime/fraud immediately
|
| 289 |
+
if any(bad in title_lower for bad in NEGATIVE_KEYWORDS):
|
| 290 |
+
continue
|
| 291 |
+
|
| 292 |
+
# 2. POSITIVE FILTER: Must have ESG context
|
| 293 |
+
# Re-adding "environmental" specifically (not just environment)
|
| 294 |
+
if "environmental" in text_to_check: pass
|
| 295 |
+
elif not any(k in text_to_check for k in ESG_KEYWORDS):
|
| 296 |
+
continue # Skip if no environmental context found
|
| 297 |
+
|
| 298 |
+
# Simple de-duplication
|
| 299 |
+
if not any(a['url'] == res['url'] for a in articles):
|
| 300 |
+
articles.append(res)
|
| 301 |
+
|
| 302 |
+
report_progress(f"News collection complete: {len(articles)} articles", 45)
|
| 303 |
+
return articles[:20]
|
| 304 |
+
|
| 305 |
+
async def get_company_reviews(company_name):
|
| 306 |
+
"""Get reviews using Web Search (Glassdoor, Reddit, etc.)"""
|
| 307 |
+
report_progress(f"Starting review collection for {company_name}", 50)
|
| 308 |
+
|
| 309 |
+
reviews = []
|
| 310 |
+
|
| 311 |
+
# Using site: operators to force specific sources
|
| 312 |
+
queries = [
|
| 313 |
+
f'site:glassdoor.com "{company_name}" reviews "environment" OR "sustainability"',
|
| 314 |
+
f'site:reddit.com "{company_name}" greenwashing OR "toxic"',
|
| 315 |
+
f'site:trustpilot.com "{company_name}" environment',
|
| 316 |
+
f'"{company_name}" employee reviews sustainability ethics',
|
| 317 |
+
f'"{company_name}" environmental controversy reviews', # Broad fallback
|
| 318 |
+
f'"{company_name}" corporate responsibility feedback' # Broad fallback
|
| 319 |
+
]
|
| 320 |
+
|
| 321 |
+
total_queries = len(queries)
|
| 322 |
+
for idx, query in enumerate(queries):
|
| 323 |
+
progress = 50 + (idx / total_queries) * 30
|
| 324 |
+
report_progress(f"Searching specific reviews: {query}", int(progress))
|
| 325 |
+
|
| 326 |
+
results = await search_web(query, max_results=8)
|
| 327 |
+
|
| 328 |
+
for res in results:
|
| 329 |
+
if len(reviews) >= 40: break
|
| 330 |
+
if not is_valid_result(res): continue # FILTER HERE
|
| 331 |
+
|
| 332 |
+
# RELEVANCE CHECK (Strict)
|
| 333 |
+
# Ensure company name is actually mentioned in title or snippet
|
| 334 |
+
c_name_lower = company_name.lower()
|
| 335 |
+
res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
|
| 336 |
+
|
| 337 |
+
# Simple substring match (can be improved with fuzzy later if needed)
|
| 338 |
+
if c_name_lower not in res_content and c_name_lower.split()[0] not in res_content:
|
| 339 |
+
# Try strict full name, then at least first word (e.g. "Google" in "Google Inc")
|
| 340 |
+
# But careful with generic first words like "The" or "Green"
|
| 341 |
+
if len(c_name_lower.split()[0]) > 3:
|
| 342 |
+
if c_name_lower.split()[0] not in res_content:
|
| 343 |
+
print(f"Skipping unrelated result: {res['title']}")
|
| 344 |
+
continue
|
| 345 |
+
else:
|
| 346 |
+
continue # Too short, require full name match
|
| 347 |
+
|
| 348 |
+
# Determine source type based on URL
|
| 349 |
+
source = "web"
|
| 350 |
+
if "glassdoor" in res['url']: source = "Glassdoor"
|
| 351 |
+
elif "twitter" in res['url'] or "x.com" in res['url']: source = "Twitter"
|
| 352 |
+
elif "linkedin" in res['url']: source = "LinkedIn"
|
| 353 |
+
elif "reddit" in res['url']: source = "Reddit"
|
| 354 |
+
elif "trustpilot" in res['url']: source = "Trustpilot"
|
| 355 |
+
|
| 356 |
+
# Clean title
|
| 357 |
+
title = res['title'].replace(" | Glassdoor", "").replace(" | Reddit", "")
|
| 358 |
+
|
| 359 |
+
reviews.append({
|
| 360 |
+
"url": res['url'],
|
| 361 |
+
"title": title,
|
| 362 |
+
"content": res['content'], # Use the snippet as the review content
|
| 363 |
+
"source_type": source
|
| 364 |
+
})
|
| 365 |
+
|
| 366 |
+
await asyncio.sleep(1)
|
| 367 |
+
|
| 368 |
+
# If few reviews found, try a broader fallback
|
| 369 |
+
if len(reviews) < 3:
|
| 370 |
+
report_progress("Few reviews found, trying specific broader query...", 75)
|
| 371 |
+
fallback_results = await search_web(f'"{company_name}" reviews environment', max_results=5)
|
| 372 |
+
for res in fallback_results:
|
| 373 |
+
if is_valid_result(res) and not any(r['url'] == res['url'] for r in reviews):
|
| 374 |
+
# RELEVANCE CHECK
|
| 375 |
+
c_name_lower = company_name.lower()
|
| 376 |
+
res_content = (res.get('title', '') + " " + res.get('content', '')).lower()
|
| 377 |
+
if c_name_lower not in res_content and c_name_lower.split()[0] not in res_content:
|
| 378 |
+
if len(c_name_lower.split()[0]) > 3:
|
| 379 |
+
if c_name_lower.split()[0] not in res_content: continue
|
| 380 |
+
else: continue
|
| 381 |
+
|
| 382 |
+
reviews.append({
|
| 383 |
+
"url": res['url'],
|
| 384 |
+
"title": res['title'],
|
| 385 |
+
"content": res['content'],
|
| 386 |
+
"source_type": "Web Search"
|
| 387 |
+
})
|
| 388 |
+
|
| 389 |
+
report_progress(f"Review collection complete: {len(reviews)} reviews", 80)
|
| 390 |
+
return reviews
|
| 391 |
+
|
| 392 |
+
# NO MOCK DATA FALLBACK
|
| 393 |
+
return reviews
|
binary_to_report_name_mapping.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
|
| 3 |
+
size 74
|
category_to_greenwashing_mapping.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
|
| 3 |
+
size 44
|
ensemble_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
|
| 3 |
+
size 246560
|
ml_models/all_feature_columns.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f0b1ae01441008b1d591702001ef5da622b49120de397b6aefe19131d2fb9cb
|
| 3 |
+
size 219
|
ml_models/binary_to_report_name_mapping.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11dd0280ff81b2d788bfdd2a3a44071c0b1ef7c8747e82c39220e3a776a9c2a1
|
| 3 |
+
size 74
|
ml_models/category_to_greenwashing_mapping.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a24eec4ecfb676159ea79d3f645867058917b1655d594351f7d049c9b51c6740
|
| 3 |
+
size 44
|
ml_models/ensemble_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:354f730c99ba19e50a0e0a26bfd214906485401866b8c748995ba10d66b19fc6
|
| 3 |
+
size 246560
|