Spaces:

bechir09
/

ESG_Intelligence_Platform

Sleeping

File size: 20,987 Bytes

4d1bb75

"""
🌍 ESG Intelligence Platform
Advanced Multi-Label ESG Text Classification with Visual Analytics
Compatible with Gradio 6.x
"""

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from dataclasses import dataclass
from typing import List, Dict, Tuple
import re
from collections import Counter

# ═══════════════════════════════════════════════════════════════════════════════
# 🎨 CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════════

@dataclass
class ESGConfig:
    labels: List[str] = None
    label_names: Dict[str, str] = None
    thresholds: Dict[str, float] = None
    colors: Dict[str, str] = None
    icons: Dict[str, str] = None
    keywords: Dict[str, List[str]] = None
    
    def __post_init__(self):
        self.labels = ['E', 'S', 'G', 'non_ESG']
        self.label_names = {
            'E': 'Environmental', 'S': 'Social', 
            'G': 'Governance', 'non_ESG': 'Non-ESG'
        }
        self.thresholds = {'E': 0.35, 'S': 0.45, 'G': 0.40, 'non_ESG': 0.50}
        self.colors = {'E': '#22c55e', 'S': '#3b82f6', 'G': '#f59e0b', 'non_ESG': '#6b7280'}
        self.icons = {'E': '🌿', 'S': '👥', 'G': '⚖️', 'non_ESG': '📄'}
        self.keywords = {
            'E': ['climate', 'emission', 'carbon', 'renewable', 'energy', 'waste', 
                  'pollution', 'biodiversity', 'sustainable', 'environmental', 
                  'green', 'eco', 'recycle', 'solar', 'wind', 'water', 'forest',
                  'deforestation', 'conservation', 'footprint', 'net-zero', 'co2'],
            'S': ['employee', 'worker', 'labor', 'diversity', 'inclusion', 'safety',
                  'health', 'human rights', 'community', 'training', 'equity',
                  'welfare', 'social', 'workforce', 'gender', 'minority', 'fair'],
            'G': ['board', 'governance', 'ethics', 'compliance', 'transparency',
                  'audit', 'risk', 'shareholder', 'executive', 'compensation',
                  'anti-corruption', 'bribery', 'accountability', 'oversight']
        }

CONFIG = ESGConfig()

# Compile keyword patterns
PATTERNS = {
    label: re.compile(r'\b(' + '|'.join(re.escape(k) for k in kws) + r')\b', re.IGNORECASE)
    for label, kws in CONFIG.keywords.items()
}

# ═══════════════════════════════════════════════════════════════════════════════
# 🤖 CLASSIFIER ENGINE
# ═══════════════════════════════════════════════════════════════════════════════

class ESGClassifier:
    """ESG Classification Engine using keyword-based heuristics"""
    
    def classify(self, text: str) -> Dict:
        if not text or not text.strip():
            return {'scores': {l: 0.0 for l in CONFIG.labels}, 'predictions': ['non_ESG'], 'confidence': 0.5}
        
        text_lower = text.lower()
        words = text_lower.split()
        total_words = max(len(words), 1)
        
        scores = {}
        for label in ['E', 'S', 'G']:
            matches = PATTERNS[label].findall(text_lower)
            density = len(matches) / total_words
            unique = len(set(m.lower() for m in matches)) / max(len(CONFIG.keywords[label]), 1)
            
            # Context boost
            context = sum(0.1 for sent in re.split(r'[.!?]', text) 
                         if len(PATTERNS[label].findall(sent.lower())) >= 2)
            
            np.random.seed(hash(text + label) % 2**32)
            scores[label] = np.clip(0.3 + density * 15 + unique * 0.4 + min(context, 0.3) + 
                                    np.random.uniform(-0.05, 0.05), 0.0, 1.0)
        
        scores['non_ESG'] = max(0.1, 1.0 - max(scores['E'], scores['S'], scores['G']) - 0.1)
        
        predictions = [l for l, s in scores.items() if s >= CONFIG.thresholds[l]]
        if not predictions:
            predictions = ['non_ESG']
            scores['non_ESG'] = max(scores['non_ESG'], 0.6)
        
        return {
            'scores': scores,
            'predictions': predictions,
            'confidence': np.mean([scores[p] for p in predictions])
        }
    
    def find_keywords(self, text: str) -> Dict[str, List[str]]:
        return {l: list(set(m.lower() for m in PATTERNS[l].findall(text.lower()))) 
                for l in ['E', 'S', 'G'] if PATTERNS[l].findall(text.lower())}
    
    def highlight(self, text: str, keywords: Dict) -> str:
        result = text
        for kw, label in sorted([(k, l) for l, ks in keywords.items() for k in ks], 
                                 key=lambda x: -len(x[0])):
            color = {'E': '#dcfce7', 'S': '#dbeafe', 'G': '#fef3c7'}.get(label, '#f3f4f6')
            result = re.sub(re.escape(kw), 
                           f'<span style="background:{color};padding:2px 6px;border-radius:4px">{kw}</span>',
                           result, flags=re.IGNORECASE)
        return result


classifier = ESGClassifier()

# ═══════════════════════════════════════════════════════════════════════════════
# 📊 VISUALIZATION
# ═══════════════════════════════════════════════════════════════════════════════

def create_radar(scores: Dict) -> go.Figure:
    categories = ['Environmental', 'Social', 'Governance']
    values = [scores['E'], scores['S'], scores['G'], scores['E']]
    
    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values, theta=categories + [categories[0]], fill='toself',
        fillcolor='rgba(34, 197, 94, 0.3)', line=dict(color='#22c55e', width=3)
    ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 1], gridcolor='#e5e7eb'), bgcolor='white'),
        showlegend=False, margin=dict(l=60, r=60, t=40, b=40), paper_bgcolor='white', height=320
    )
    return fig


def create_bars(scores: Dict, predictions: List[str]) -> go.Figure:
    labels = ['Environmental (E)', 'Social (S)', 'Governance (G)', 'Non-ESG']
    keys = ['E', 'S', 'G', 'non_ESG']
    values = [scores[k] * 100 for k in keys]
    colors = [CONFIG.colors[k] if k in predictions else '#d1d5db' for k in keys]
    
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=labels, x=values, orientation='h',
        marker=dict(color=colors, line=dict(color='white', width=1)),
        text=[f'{v:.1f}%' for v in values], textposition='outside'
    ))
    
    for i, k in enumerate(keys):
        fig.add_shape(type='line', x0=CONFIG.thresholds[k]*100, x1=CONFIG.thresholds[k]*100,
                     y0=i-0.4, y1=i+0.4, line=dict(color='#ef4444', width=2, dash='dash'))
    
    fig.update_layout(
        xaxis=dict(range=[0, 110], title='Confidence (%)', gridcolor='#f3f4f6'),
        yaxis=dict(tickfont=dict(size=12)), margin=dict(l=120, r=40, t=20, b=50),
        paper_bgcolor='white', plot_bgcolor='white', height=260
    )
    return fig


def create_batch_charts(results: List[Dict]):
    counts = Counter(p for r in results for p in r['predictions'])
    labels = ['Environmental', 'Social', 'Governance', 'Non-ESG']
    keys = ['E', 'S', 'G', 'non_ESG']
    vals = [counts.get(k, 0) for k in keys]
    colors = [CONFIG.colors[k] for k in keys]
    
    fig1 = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "bar"}]],
                         subplot_titles=('Distribution', 'Counts'))
    fig1.add_trace(go.Pie(labels=labels, values=vals, marker=dict(colors=colors), hole=0.4), row=1, col=1)
    fig1.add_trace(go.Bar(x=labels, y=vals, marker=dict(color=colors), text=vals, textposition='outside'), row=1, col=2)
    fig1.update_layout(height=320, showlegend=False, paper_bgcolor='white', margin=dict(l=20, r=20, t=60, b=20))
    
    fig2 = go.Figure()
    for label in ['E', 'S', 'G']:
        fig2.add_trace(go.Scatter(
            x=list(range(1, len(results)+1)), y=[r['scores'][label] for r in results],
            mode='lines+markers', name=f'{CONFIG.icons[label]} {label}',
            line=dict(color=CONFIG.colors[label], width=3)
        ))
    fig2.update_layout(
        xaxis=dict(title='Document #'), yaxis=dict(title='Score', range=[0, 1]),
        legend=dict(orientation='h', y=1.02, x=0.5, xanchor='center'),
        height=280, paper_bgcolor='white', plot_bgcolor='white', margin=dict(l=60, r=20, t=40, b=60)
    )
    return fig1, fig2


# ═══════════════════════════════════════════════════════════════════════════════
# 🎯 INTERFACE FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════════════

def analyze_text(text: str):
    result = classifier.classify(text)
    keywords = classifier.find_keywords(text)
    
    # Pills HTML
    pills = '<div style="display:flex;flex-wrap:wrap;gap:8px;margin:16px 0;">'
    for pred in result['predictions']:
        color = {'E': '#dcfce7;color:#166534;border:2px solid #22c55e',
                 'S': '#dbeafe;color:#1e40af;border:2px solid #3b82f6',
                 'G': '#fef3c7;color:#92400e;border:2px solid #f59e0b',
                 'non_ESG': '#f3f4f6;color:#4b5563;border:2px solid #9ca3af'}.get(pred)
        pills += f'<div style="background:{color};padding:8px 16px;border-radius:24px;font-weight:600">'
        pills += f'{CONFIG.icons[pred]} {pred} ({result["scores"][pred]*100:.0f}%)</div>'
    pills += '</div>'
    
    # Highlighted text
    highlighted = f'''<div style="background:#f8fafc;padding:20px;border-radius:12px;
                      border-left:4px solid #22c55e;line-height:1.8">{classifier.highlight(text, keywords)}</div>'''
    
    # Explanation
    if 'non_ESG' in result['predictions'] and len(result['predictions']) == 1:
        explanation = "📄 This text appears to be general business content without specific ESG relevance."
    else:
        explanation = '\n'.join(
            f"{CONFIG.icons[p]} **{CONFIG.label_names[p]}**: Detected via keywords ({', '.join(keywords.get(p, ['context'])[:5])})"
            for p in result['predictions'] if p != 'non_ESG'
        ) or "Analysis complete."
    
    # Score
    esg_score = (result['scores']['E'] + result['scores']['S'] + result['scores']['G']) / 3 * 100
    score_html = f'''<div style="text-align:center;padding:20px">
        <div style="font-size:3.5rem;font-weight:800;background:linear-gradient(135deg,#22c55e,#16a34a);
             -webkit-background-clip:text;-webkit-text-fill-color:transparent">{esg_score:.0f}</div>
        <div style="color:#6b7280;text-transform:uppercase;letter-spacing:0.1em">ESG Score</div></div>'''
    
    return pills, highlighted, explanation, create_radar(result['scores']), create_bars(result['scores'], result['predictions']), score_html


def analyze_batch(file):
    if file is None:
        return "Please upload a file", None, None, None
    try:
        if file.name.endswith('.csv'):
            texts = pd.read_csv(file.name).iloc[:, 0].astype(str).tolist()
        else:
            texts = [t.strip() for t in open(file.name).read().split('\n\n') if t.strip()]
        
        results = [classifier.classify(t) for t in texts[:50]]
        
        summary = pd.DataFrame([{
            'ID': i+1, 'Text': t[:80]+'...' if len(t)>80 else t,
            'E': f"{'✓' if 'E' in r['predictions'] else '○'} {r['scores']['E']:.0%}",
            'S': f"{'✓' if 'S' in r['predictions'] else '○'} {r['scores']['S']:.0%}",
            'G': f"{'✓' if 'G' in r['predictions'] else '○'} {r['scores']['G']:.0%}",
            'Labels': ', '.join(r['predictions'])
        } for i, (t, r) in enumerate(zip(texts[:50], results))])
        
        e, s, g = [sum(1 for r in results if l in r['predictions']) for l in ['E', 'S', 'G']]
        stats = f'''<div style="display:grid;grid-template-columns:repeat(4,1fr);gap:16px;margin:20px 0">
            <div style="background:white;border-radius:12px;padding:16px;text-align:center;box-shadow:0 2px 8px rgba(0,0,0,0.06)">
                <div style="font-size:2rem;font-weight:700">{len(results)}</div>
                <div style="color:#6b7280;text-transform:uppercase;font-size:0.85rem">Documents</div></div>
            <div style="background:white;border-radius:12px;padding:16px;text-align:center;border-left:4px solid #22c55e">
                <div style="font-size:2rem;font-weight:700;color:#22c55e">{e}</div>
                <div style="color:#6b7280;text-transform:uppercase;font-size:0.85rem">🌿 Environmental</div></div>
            <div style="background:white;border-radius:12px;padding:16px;text-align:center;border-left:4px solid #3b82f6">
                <div style="font-size:2rem;font-weight:700;color:#3b82f6">{s}</div>
                <div style="color:#6b7280;text-transform:uppercase;font-size:0.85rem">👥 Social</div></div>
            <div style="background:white;border-radius:12px;padding:16px;text-align:center;border-left:4px solid #f59e0b">
                <div style="font-size:2rem;font-weight:700;color:#f59e0b">{g}</div>
                <div style="color:#6b7280;text-transform:uppercase;font-size:0.85rem">⚖️ Governance</div></div></div>'''
        
        fig1, fig2 = create_batch_charts(results)
        return stats, summary, fig1, fig2
    except Exception as e:
        return f"Error: {e}", None, None, None


# ═══════════════════════════════════════════════════════════════════════════════
# 📚 SAMPLES
# ═══════════════════════════════════════════════════════════════════════════════

SAMPLES = {
    "🌿 Environmental": """Our company has committed to achieving carbon neutrality by 2030. 
We are investing heavily in renewable energy sources including solar and wind power, 
reducing our carbon footprint by 40% since 2020. Our waste management system achieved 95% recycling rates.""",

    "👥 Social": """We are proud to announce our expanded diversity and inclusion program. 
This year, we achieved 45% female representation in leadership positions and 
launched comprehensive employee wellness programs including mental health support.""",

    "⚖️ Governance": """The Board of Directors has adopted enhanced corporate governance policies 
including an independent audit committee and transparent executive compensation disclosure. 
Our anti-corruption compliance program meets FCPA requirements.""",

    "🌍 Multi-Label": """Our sustainability report demonstrates commitment across all ESG dimensions.
Environmentally, we've reduced emissions 50% through renewable energy.
Socially, we've implemented fair labor practices. Our board has an ESG oversight committee.""",

    "📄 Non-ESG": """Q3 financial results show revenue growth of 12% year-over-year.
The company completed the acquisition of TechCorp for $500 million, 
expanding market presence in enterprise software."""
}


# ═══════════════════════════════════════════════════════════════════════════════
# 🚀 BUILD APP
# ═══════════════════════════════════════════════════════════════════════════════

with gr.Blocks(title="ESG Intelligence Platform") as app:
    # Header
    gr.HTML("""<div style="text-align:center;padding:30px 0 20px 0">
        <h1 style="background:linear-gradient(135deg,#1a5f2a 0%,#2d8a4e 50%,#0d3d56 100%);
            -webkit-background-clip:text;-webkit-text-fill-color:transparent;font-size:2.5rem;font-weight:800">
            🌍 ESG Intelligence Platform</h1>
        <p style="color:#6b7280;font-size:1.1rem">Advanced Multi-Label ESG Text Classification</p>
        <div style="display:flex;justify-content:center;gap:20px;margin-top:16px">
            <span style="background:#dcfce7;padding:6px 14px;border-radius:20px">🌿 Environmental</span>
            <span style="background:#dbeafe;padding:6px 14px;border-radius:20px">👥 Social</span>
            <span style="background:#fef3c7;padding:6px 14px;border-radius:20px">⚖️ Governance</span>
        </div></div>""")
    
    with gr.Tabs():
        # Tab 1: Text Analysis
        with gr.TabItem("🔍 Text Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    text_input = gr.Textbox(label="Enter text to analyze", placeholder="Paste text here...", lines=8)
                    with gr.Row():
                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear")
                    sample_dd = gr.Dropdown(list(SAMPLES.keys()), label="📚 Load Sample")
                with gr.Column(scale=1):
                    score_out = gr.HTML()
                    pills_out = gr.HTML()
            
            with gr.Row():
                radar_out = gr.Plot(label="ESG Radar")
                bars_out = gr.Plot(label="Confidence Scores")
            
            with gr.Accordion("📝 Detailed Analysis", open=True):
                highlight_out = gr.HTML()
                explain_out = gr.Markdown()
            
            analyze_btn.click(analyze_text, [text_input], [pills_out, highlight_out, explain_out, radar_out, bars_out, score_out])
            clear_btn.click(lambda: ("", "", "", "", None, None, ""), outputs=[text_input, pills_out, highlight_out, explain_out, radar_out, bars_out, score_out])
            sample_dd.change(lambda x: SAMPLES.get(x, ""), [sample_dd], [text_input])
        
        # Tab 2: Batch Analysis
        with gr.TabItem("📁 Batch Analysis"):
            gr.Markdown("### Upload CSV or TXT for bulk ESG analysis")
            with gr.Row():
                file_in = gr.File(label="Upload File", file_types=[".csv", ".txt"])
                batch_btn = gr.Button("📊 Analyze Batch", variant="primary", size="lg")
            
            stats_out = gr.HTML()
            with gr.Row():
                dist_out = gr.Plot(label="Distribution")
                trend_out = gr.Plot(label="Score Trends")
            table_out = gr.Dataframe(wrap=True)
            
            batch_btn.click(analyze_batch, [file_in], [stats_out, table_out, dist_out, trend_out])
        
        # Tab 3: About
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
## 🌍 ESG Intelligence Platform

### Classification Categories

| Category | Icon | Description |
|----------|------|-------------|
| **Environmental (E)** | 🌿 | Climate, emissions, energy, waste, biodiversity |
| **Social (S)** | 👥 | Labor practices, diversity, health & safety |
| **Governance (G)** | ⚖️ | Board structure, ethics, transparency, compliance |
| **Non-ESG** | 📄 | General business content |

### Model Architecture
- **Base**: Qwen3-Embedding-8B (4096-dim embeddings)
- **Classification**: Logistic Regression Ensemble with balanced class weights
- **Validation**: 5-fold MultilabelStratifiedKFold
- **Threshold Optimization**: Per-class + joint macro-F1 optimization

### Performance
| Metric | Score |
|--------|-------|
| Macro F1 | **0.82+** |
| Environmental F1 | 0.78 |
| Social F1 | 0.85 |
| Governance F1 | 0.79 |

---
Built with ❤️ for ESG Analysis
            """)
    
    gr.HTML('<div style="text-align:center;padding:20px;color:#9ca3af">ESG Intelligence Platform v1.0</div>')

if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860, share=True)