""" ESG Report Analyser — working prototype for HuggingFace Spaces No ML models. No vector DB. Just pdfplumber + Gradio. Fully functional. """ import gradio as gr import re import json from pathlib import Path from collections import Counter # ───────────────────────────────────────────────────────────────────────────── # CONFIG # ───────────────────────────────────────────────────────────────────────────── GREENWASHING_KW = [ "carbon neutral", "net-zero", "net zero", "zero emissions", "100% renewable", "carbon offset", "zero waste", "eco-friendly", "fully sustainable", "nature positive", "carbon negative", "climate positive", "green certified", "biodegradable" ] ESG = { "Environmental": ["carbon","emission","climate","renewable","energy","water", "waste","pollution","solar","wind","biodiversity","greenhouse", "deforestation","recycl","fossil"], "Social": ["employee","diversity","inclusion","health","safety", "human rights","labour","labor","gender","community", "training","wellbeing","wage","stakeholder"], "Governance": ["board","audit","compliance","ethics","transparent", "corruption","disclosure","regulation","policy", "shareholder","executive","accountability","risk"] } SECTORS = { "Energy & Utilities": ["oil","gas","electricity","utility","power plant"], "Finance & Banking": ["bank","investment","portfolio","loan","insurance"], "Technology": ["software","data center","cloud","semiconductor"], "Manufacturing": ["factory","manufacturing","production","supply chain"], "Consumer Goods": ["retail","consumer","packaging","brand","fmcg"], "Healthcare": ["pharmaceutical","medical","hospital","clinical"], "Agriculture & Food": ["agriculture","food","farming","crop","livestock"], "Transportation": ["aviation","shipping","fleet","transport","logistics"], } # ───────────────────────────────────────────────────────────────────────────── # STATE # ───────────────────────────────────────────────────────────────────────────── doc = {"pages": [], "text": "", "name": ""} # 
# ─────────────────────────────────────────────────────────────────────────────
# PDF PARSING
# ─────────────────────────────────────────────────────────────────────────────

def parse_pdf(path):
    """Extract text page-by-page from the PDF at *path*.

    Returns a list of ``{"page": <1-based number>, "text": <stripped text>}``,
    skipping pages with no extractable text (e.g. scanned images).
    """
    import pdfplumber  # local import keeps app start-up cheap
    pages = []
    with pdfplumber.open(path) as pdf:
        for i, p in enumerate(pdf.pages):
            t = (p.extract_text() or "").strip()
            if t:
                pages.append({"page": i + 1, "text": t})
    return pages


# Shared sentence splitter (search / greenwashing scan / graph building):
# split after . ! ? followed by whitespace, or at any newline.
# Compiled once instead of being re-built on every call.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|\n")

# ─────────────────────────────────────────────────────────────────────────────
# SEARCH (simple sentence-level keyword ranking — no model needed)
# ─────────────────────────────────────────────────────────────────────────────

def search(query, pages, top_k=5):
    """Split every page into sentences, score by query word overlap, return best.

    Score = total occurrences of each query word in the lowercased sentence.
    Results are deduplicated by their first 60 characters and truncated to
    *top_k* entries of ``{"page", "text", "score"}``.
    """
    q_words = set(re.sub(r"[^\w\s]", "", query.lower()).split())
    candidates = []
    for pg in pages:
        for sent in _SENT_SPLIT.split(pg["text"]):
            if len(sent.split()) < 5:  # skip headings / fragments
                continue
            low = sent.lower()  # lowercase once, not once per query word
            score = sum(low.count(w) for w in q_words)
            if score > 0:
                candidates.append({"page": pg["page"],
                                   "text": sent.strip(),
                                   "score": score})
    candidates.sort(key=lambda x: -x["score"])
    # deduplicate by first 60 chars
    seen, out = set(), []
    for c in candidates:
        key = c["text"][:60]
        if key not in seen:
            seen.add(key)
            out.append(c)
            if len(out) == top_k:
                break
    return out

# ─────────────────────────────────────────────────────────────────────────────
# ANALYSIS HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def esg_scores(text):
    """Return {pillar: percentage} from keyword-stem frequency in *text*."""
    t = text.lower()
    raw = {k: sum(t.count(w) for w in ws) for k, ws in ESG.items()}
    total = sum(raw.values()) or 1  # avoid ZeroDivisionError on no hits
    return {k: round(v / total * 100, 1) for k, v in raw.items()}


def detect_sector(text):
    """Return the best-matching sector name, or a generic fallback."""
    t = text.lower()
    hits = {s: sum(t.count(w) for w in ws) for s, ws in SECTORS.items()}
    best = max(hits, key=hits.get)
    return best if hits[best] > 0 else "General / Diversified"


def greenwash_flags(pages):
    """Scan pages for greenwashing keywords.

    Returns one flag per unique (page, keyword) pair:
    ``{"page", "kw", "snip": <sentence snippet>, "ok": <bool>}``.
    ``ok`` is True when verification language appears anywhere on the same
    page (page-level check, not sentence-level).
    """
    verification_words = ["certified", "verified", "audited",
                          "third party", "sbti", "independently"]
    flags, seen = [], set()
    for pg in pages:
        t = pg["text"].lower()
        matched = [kw for kw in GREENWASHING_KW if kw in t]
        if not matched:
            continue
        # split and verify once per page, not once per matched keyword
        sentences = _SENT_SPLIT.split(pg["text"])
        verified = any(w in t for w in verification_words)
        for kw in matched:
            if (pg["page"], kw) not in seen:
                seen.add((pg["page"], kw))
                # grab the sentence containing the keyword
                snip = next((s for s in sentences if kw in s.lower()),
                            pg["text"][:180])
                flags.append({"page": pg["page"], "kw": kw,
                              "snip": snip[:220], "ok": verified})
    return flags


def classify_sentence(s):
    """Assign a discourse role to sentence *s*: claim / evidence / policy /
    metric / context (first matching rule wins)."""
    t = s.lower()
    if any(k in t for k in GREENWASHING_KW):
        return "claim"
    if any(k in t for k in ["%", "tonne", "kwh", "mwh", "litre", "gallon"]):
        return "evidence"
    if any(k in t for k in ["target", "goal", "by 2030", "by 2050", "we will", "commit"]):
        return "policy"
    if any(k in t for k in ["kpi", "metric", "indicator", "index"]):
        return "metric"
    return "context"


def build_graph_summary(pages):
    """Classify every sentence and count discourse-graph nodes and edges.

    Returns ``(role_counts, edges)``. Note that ``prev_role`` carries across
    page boundaries, so consecutive sentences on adjacent pages are linked.
    """
    role_counts = Counter()
    edges = {"follows": 0, "claim→evidence": 0, "policy→metric": 0}
    prev_role = None
    for pg in pages:
        for sent in _SENT_SPLIT.split(pg["text"]):
            if len(sent.split()) < 4:
                continue
            r = classify_sentence(sent)
            role_counts[r] += 1
            if prev_role:
                edges["follows"] += 1
                if prev_role == "claim" and r == "evidence":
                    edges["claim→evidence"] += 1
                if prev_role == "policy" and r == "metric":
                    edges["policy→metric"] += 1
            prev_role = r
    return role_counts, edges

# ─────────────────────────────────────────────────────────────────────────────
# GRADIO HANDLERS
# ─────────────────────────────────────────────────────────────────────────────

def handle_upload(pdf):
    """Parse the uploaded PDF, populate module state, and return a summary."""
    if pdf is None:
        return "⚠️ Upload a PDF file."
    try:
        # Gradio passes either a tempfile-like object (.name) or, in newer
        # versions, a plain filepath string — accept both.
        path = pdf if isinstance(pdf, str) else pdf.name
        pages = parse_pdf(path)
        if not pages:
            return "❌ No text found. Make sure the PDF is not a scanned image."
        doc["pages"] = pages
        doc["text"] = " ".join(p["text"] for p in pages)
        doc["name"] = Path(path).name
        role_c, _ = build_graph_summary(pages)
        return (
            f"✅ **{doc['name']}** loaded\n\n"
            f"- **{len(pages)} pages** parsed\n"
            f"- **{sum(role_c.values())} sentences** analysed\n"
            f"- Node roles: `{dict(role_c)}`\n\n"
            "Use the tabs above to explore the report."
        )
    except Exception as e:
        # top-level UI boundary: surface the error instead of crashing the app
        return f"❌ Error: {e}"


def handle_qa(question):
    """Answer *question* from the loaded report; returns (answer, evidence)."""
    if not doc["pages"]:
        return "⚠️ Upload a document first.", ""
    if not question.strip():
        return "⚠️ Type a question.", ""
    hits = search(question, doc["pages"])
    if not hits:
        return "Nothing relevant found. Try different keywords.", ""
    answer = f"### Answer — *{doc['name']}*\n\n"
    for h in hits:
        answer += f"**Page {h['page']}:** {h['text']}\n\n"
    evidence = "### 📎 Matched Sentences\n\n"
    for i, h in enumerate(hits, 1):
        r = classify_sentence(h["text"])
        evidence += (f"**[{i}] Page {h['page']} · role `{r}` · score {h['score']}**\n"
                     f"> {h['text']}\n\n")
    return answer, evidence


def handle_scores():
    """Render the keyword-frequency ESG scores as a Markdown table."""
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    scores = esg_scores(doc["text"])
    sector = detect_sector(doc["text"])
    overall = round(sum(scores.values()) / 3, 1)

    def bar(v):
        # 20-char bar; one filled cell per 5 percentage points
        f = min(int(v / 5), 20)
        return "█" * f + "░" * (20 - f)

    icons = {"Environmental": "🌿", "Social": "👥", "Governance": "🏛️"}
    rows = "\n".join(
        f"| {icons[k]} {k} | {v}% | `{bar(v)}` |" for k, v in scores.items()
    )
    return (
        f"## 📊 ESG Scores — *{doc['name']}*\n\n"
        f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
        f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
        f"**Sector detected:** {sector}\n\n"
        "> Scores reflect keyword frequency across the report."
    )


def handle_greenwash():
    """Render the greenwashing scan, split into unverified vs evidenced claims."""
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    flags = greenwash_flags(doc["pages"])
    if not flags:
        return "✅ No greenwashing keywords detected in this document."
    bad = [f for f in flags if not f["ok"]]
    good = [f for f in flags if f["ok"]]
    out = [f"## 🚨 Greenwashing Scan — *{doc['name']}*\n",
           f"**{len(bad)} unverified ⚠️**  |  **{len(good)} evidenced ✅**\n\n---\n"]
    if bad:
        out.append("### ⚠️ Unverified Claims\n")
        for f in bad:
            out.append(f"📍 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
    if good:
        out.append("\n### ✅ Claims With Supporting Evidence\n")
        for f in good:
            out.append(f"📍 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
    return "\n".join(out)


def handle_graph():
    """Render the discourse-graph summary (node roles and edge counts)."""
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    role_c, edges = build_graph_summary(doc["pages"])
    total_nodes = sum(role_c.values())
    total_edges = sum(edges.values())
    if total_nodes == 0:
        # guard: every sentence was shorter than 4 words → avoid ZeroDivisionError
        return "⚠️ No sentences long enough to analyse in this document."
    role_rows = "\n".join(
        f"| `{r}` | {n} | {round(n / total_nodes * 100, 1)}% |"
        for r, n in role_c.most_common()
    )
    edge_rows = "\n".join(f"| `{e}` | {n} |" for e, n in edges.items())
    return (
        f"## 🕸️ Discourse Graph — *{doc['name']}*\n\n"
        f"**{total_nodes} nodes** (sentences) · **{total_edges} edges**\n\n"
        f"### Node Roles\n| Role | Count | Share |\n|------|-------|-------|\n{role_rows}\n\n"
        f"### Edge Types\n| Relation | Count |\n|----------|-------|\n{edge_rows}\n\n"
        "**How edges are inferred:**\n"
        "- Every consecutive sentence pair → `follows`\n"
        "- `claim` followed by `evidence` → `claim→evidence`\n"
        "- `policy` followed by `metric` → `policy→metric`\n\n"
        "> These relations power multi-hop retrieval: a question hitting a **claim** node "
        "automatically expands to its linked **evidence** nodes."
    )

# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────

with gr.Blocks(title="ESG Analyser") as demo:
    gr.Markdown(
        "# 🌿 ESG Report Analyser\n"
        "Upload a sustainability / ESG report PDF and explore it instantly."
    )
    with gr.Tab("📤 Upload"):
        up_file = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
        up_btn = gr.Button("Process Document", variant="primary")
        up_out = gr.Markdown("Upload a PDF above and click **Process Document**.")
        up_btn.click(handle_upload, up_file, up_out)
    with gr.Tab("💬 Q&A"):
        q_box = gr.Textbox(label="Ask anything about the report",
                           placeholder="e.g. What are the carbon reduction targets?")
        q_btn = gr.Button("Ask", variant="primary")
        q_ans = gr.Markdown()
        q_ev = gr.Markdown()
        gr.Examples([
            ["What are the Scope 1 and 2 emissions?"],
            ["What diversity and inclusion initiatives are mentioned?"],
            ["What renewable energy commitments has the company made?"],
            ["What governance and audit policies are described?"],
            ["How does the company manage supply chain risks?"],
        ], inputs=q_box)
        q_btn.click(handle_qa, q_box, [q_ans, q_ev])
    with gr.Tab("📊 ESG Scores"):
        s_btn = gr.Button("Compute ESG Scores", variant="primary")
        s_out = gr.Markdown()
        s_btn.click(handle_scores, outputs=s_out)
    with gr.Tab("🚨 Greenwashing"):
        g_btn = gr.Button("Scan for Greenwashing", variant="primary")
        g_out = gr.Markdown()
        g_btn.click(handle_greenwash, outputs=g_out)
    with gr.Tab("🕸️ Graph"):
        d_btn = gr.Button("Build Discourse Graph", variant="primary")
        d_out = gr.Markdown()
        d_btn.click(handle_graph, outputs=d_out)

demo.launch()