"""
ESG Report Analyser — working prototype for HuggingFace Spaces
No ML models. No vector DB. Just pdfplumber + Gradio. Fully functional.
"""
import json
import re
from collections import Counter
from pathlib import Path

import gradio as gr
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# Marketing phrases that frequently signal unsubstantiated environmental
# claims.  Matched as lowercase substrings against page text, so both the
# hyphenated and spaced spellings of "net zero" are listed.
GREENWASHING_KW = [
    "carbon neutral", "net-zero", "net zero", "zero emissions",
    "100% renewable", "carbon offset", "zero waste", "eco-friendly",
    "fully sustainable", "nature positive", "carbon negative",
    "climate positive", "green certified", "biodegradable"
]
# Keyword stems per ESG pillar.  Substring matching is intentional:
# "recycl" covers "recycle"/"recycling", "emission" covers "emissions".
ESG = {
    "Environmental": ["carbon", "emission", "climate", "renewable", "energy", "water",
                      "waste", "pollution", "solar", "wind", "biodiversity", "greenhouse",
                      "deforestation", "recycl", "fossil"],
    "Social": ["employee", "diversity", "inclusion", "health", "safety",
               "human rights", "labour", "labor", "gender", "community",
               "training", "wellbeing", "wage", "stakeholder"],
    "Governance": ["board", "audit", "compliance", "ethics", "transparent",
                   "corruption", "disclosure", "regulation", "policy",
                   "shareholder", "executive", "accountability", "risk"]
}
# Sector cue words used by detect_sector(); the sector with the highest
# total hit count wins.
SECTORS = {
    "Energy & Utilities": ["oil", "gas", "electricity", "utility", "power plant"],
    "Finance & Banking": ["bank", "investment", "portfolio", "loan", "insurance"],
    "Technology": ["software", "data center", "cloud", "semiconductor"],
    "Manufacturing": ["factory", "manufacturing", "production", "supply chain"],
    "Consumer Goods": ["retail", "consumer", "packaging", "brand", "fmcg"],
    "Healthcare": ["pharmaceutical", "medical", "hospital", "clinical"],
    "Agriculture & Food": ["agriculture", "food", "farming", "crop", "livestock"],
}
# ─────────────────────────────────────────────────────────────────────────────
# STATE
# ─────────────────────────────────────────────────────────────────────────────
# Module-level document cache shared by all Gradio handlers.  Single-user
# prototype design: the whole dict is overwritten by handle_upload().
doc = {"pages": [], "text": "", "name": ""}  # always reset on new upload
# ─────────────────────────────────────────────────────────────────────────────
# PDF PARSING
# ─────────────────────────────────────────────────────────────────────────────
def parse_pdf(path):
    """Extract text from a PDF, one entry per page.

    Returns a list of {"page": <1-based number>, "text": <stripped text>},
    silently skipping pages with no extractable text (e.g. scanned images).
    """
    import pdfplumber  # lazy import: keeps app startup cheap
    with pdfplumber.open(path) as pdf:
        numbered = ((idx + 1, (page.extract_text() or "").strip())
                    for idx, page in enumerate(pdf.pages))
        return [{"page": num, "text": txt} for num, txt in numbered if txt]
# ─────────────────────────────────────────────────────────────────────────────
# SEARCH (simple sentence-level keyword ranking — no model needed)
# ─────────────────────────────────────────────────────────────────────────────
def search(query, pages, top_k=5):
    """Split every page into sentences, score by query word overlap, return best."""
    # Strip punctuation from the query, then treat each remaining word as a term.
    terms = set(re.sub(r"[^\w\s]", "", query.lower()).split())
    scored = []
    for page in pages:
        # Sentence boundaries: end-of-sentence punctuation or a newline.
        for sentence in re.split(r"(?<=[.!?])\s+|\n", page["text"]):
            if len(sentence.split()) < 5:
                continue  # too short to be a meaningful sentence
            lowered = sentence.lower()
            hits = sum(lowered.count(term) for term in terms)
            if hits:
                scored.append({"page": page["page"],
                               "text": sentence.strip(),
                               "score": hits})
    # Stable descending sort keeps document order among equal scores.
    scored.sort(key=lambda item: item["score"], reverse=True)
    # Deduplicate near-identical sentences by their first 60 characters.
    seen, best = set(), []
    for cand in scored:
        fingerprint = cand["text"][:60]
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        best.append(cand)
        if len(best) == top_k:
            break
    return best
# ─────────────────────────────────────────────────────────────────────────────
# ANALYSIS HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def esg_scores(text):
    """Percentage share of keyword hits per ESG pillar.

    Counts lowercase substring occurrences of each pillar's keywords and
    normalizes to percentages of the grand total (one decimal place).
    """
    lowered = text.lower()
    tallies = {}
    for pillar, keywords in ESG.items():
        tallies[pillar] = sum(lowered.count(kw) for kw in keywords)
    denom = sum(tallies.values()) or 1  # avoid div-by-zero on keyword-free text
    return {pillar: round(hits / denom * 100, 1)
            for pillar, hits in tallies.items()}
def detect_sector(text):
    """Best-guess industry sector from cue-word hit counts.

    Falls back to "General / Diversified" when no sector keyword appears.
    """
    lowered = text.lower()
    tallies = {sector: sum(lowered.count(cue) for cue in cues)
               for sector, cues in SECTORS.items()}
    top = max(tallies, key=tallies.get)
    return top if tallies[top] else "General / Diversified"
def greenwash_flags(pages):
    """Scan each page for greenwashing phrases.

    Returns a list of {"page", "kw", "snip", "ok"} dicts:
    - "snip": first sentence containing the phrase (max 220 chars), falling
      back to the start of the page text if no sentence matches.
    - "ok": True when the same page also contains verification language
      ("certified", "audited", "sbti", ...), i.e. the claim has some backing.

    Fix vs original: the sentence split and the page-level `verified` check
    were recomputed once per matched keyword; both are per-page properties,
    so they are now hoisted out of the keyword loop (same output, less work).
    """
    flags, seen = [], set()
    for pg in pages:
        t = pg["text"].lower()
        matched = [kw for kw in GREENWASHING_KW if kw in t]
        if not matched:
            continue
        # Per-page work, computed once per page rather than once per keyword.
        sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
        verified = any(w in t for w in
                       ["certified", "verified", "audited", "third party",
                        "sbti", "independently"])
        for kw in matched:
            if (pg["page"], kw) in seen:
                continue  # defensive: at most one flag per (page, phrase)
            seen.add((pg["page"], kw))
            # Grab the sentence containing the keyword.
            snip = next((s for s in sentences if kw in s.lower()), pg["text"][:180])
            flags.append({"page": pg["page"], "kw": kw,
                          "snip": snip[:220], "ok": verified})
    return flags
def classify_sentence(s):
    """Assign a coarse discourse role to one sentence.

    Priority order is deliberate: a greenwashing phrase makes it a "claim"
    regardless of anything else, then quantitative "evidence", then
    forward-looking "policy", then "metric"; everything else is "context".
    """
    t = s.lower()
    if any(k in t for k in GREENWASHING_KW):
        return "claim"
    if any(k in t for k in ["%", "tonne", "kwh", "mwh", "litre", "gallon"]):
        return "evidence"
    if any(k in t for k in ["target", "goal", "by 2030", "by 2050", "we will", "commit"]):
        return "policy"
    if any(k in t for k in ["kpi", "metric", "indicator", "index"]):
        return "metric"
    return "context"


def build_graph_summary(pages):
    """Count sentence roles and simple adjacency edges across the document.

    Returns (role_counts, edges) where role_counts is a Counter of discourse
    roles and edges counts consecutive-sentence relations.  Edge keys use a
    proper arrow character (fixes the mojibake'd "claimβevidence" keys).
    `prev_role` intentionally carries over page boundaries, so a "follows"
    edge links the last sentence of one page to the first of the next.
    """
    role_counts = Counter()
    edges = {"follows": 0, "claim→evidence": 0, "policy→metric": 0}
    prev_role = None
    for pg in pages:
        for sent in re.split(r"(?<=[.!?])\s+|\n", pg["text"]):
            if len(sent.split()) < 4:
                continue  # ignore fragments / headings
            role = classify_sentence(sent)
            role_counts[role] += 1
            if prev_role:
                edges["follows"] += 1
                if prev_role == "claim" and role == "evidence":
                    edges["claim→evidence"] += 1
                if prev_role == "policy" and role == "metric":
                    edges["policy→metric"] += 1
            prev_role = role
    return role_counts, edges
# ─────────────────────────────────────────────────────────────────────────────
# GRADIO HANDLERS
# ─────────────────────────────────────────────────────────────────────────────
def handle_upload(pdf):
    """Parse the uploaded PDF into the module-level `doc` cache.

    Returns a Markdown status string for the UI.  Status markers use real
    emoji (⚠️/❌/✅); the original "β"-prefixed text was mojibake.
    Any parsing exception is caught and surfaced as a message rather than
    crashing the Gradio handler.
    """
    if pdf is None:
        return "⚠️ Upload a PDF file."
    try:
        pages = parse_pdf(pdf.name)
        if not pages:
            return "❌ No text found. Make sure the PDF is not a scanned image."
        doc["pages"] = pages
        doc["text"] = " ".join(p["text"] for p in pages)
        doc["name"] = Path(pdf.name).name
        role_c, _ = build_graph_summary(pages)
        return (
            f"✅ **{doc['name']}** loaded\n\n"
            f"- **{len(pages)} pages** parsed\n"
            f"- **{sum(role_c.values())} sentences** analysed\n"
            f"- Node roles: `{dict(role_c)}`\n\n"
            "Use the tabs above to explore the report."
        )
    except Exception as e:  # surface the error in the UI, don't crash the app
        return f"❌ Error: {e}"
def handle_qa(question):
    """Answer a question via keyword search over the cached document.

    Returns (answer_markdown, evidence_markdown).  Text markers restored
    from mojibake: ⚠️ warnings, em dash, "·" separators, 🔍 heading.
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first.", ""
    if not question.strip():
        return "⚠️ Type a question.", ""
    hits = search(question, doc["pages"])
    if not hits:
        return "Nothing relevant found. Try different keywords.", ""
    answer = f"### Answer — *{doc['name']}*\n\n"
    for h in hits:
        answer += f"**Page {h['page']}:** {h['text']}\n\n"
    evidence = "### 🔍 Matched Sentences\n\n"
    for i, h in enumerate(hits, 1):
        r = classify_sentence(h["text"])
        evidence += f"**[{i}] Page {h['page']} · role `{r}` · score {h['score']}**\n> {h['text']}\n\n"
    return answer, evidence
def handle_scores():
    """Render ESG pillar scores and the detected sector as a Markdown table.

    Bar glyphs restored from mojibake: "█" filled / "░" empty cells.
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    scores = esg_scores(doc["text"])
    sector = detect_sector(doc["text"])
    overall = round(sum(scores.values()) / 3, 1)
    def bar(v):
        # 20-character bar: one filled cell per 5 percentage points.
        f = min(int(v / 5), 20)
        return "█" * f + "░" * (20 - f)
    icons = {"Environmental": "🌿", "Social": "👥", "Governance": "🏛️"}
    rows = "\n".join(
        f"| {icons[k]} {k} | {v}% | `{bar(v)}` |"
        for k, v in scores.items()
    )
    return (
        f"## 📊 ESG Scores — *{doc['name']}*\n\n"
        f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
        f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
        f"**Sector detected:** {sector}\n\n"
        "> Scores reflect keyword frequency across the report."
    )
def handle_greenwash():
    """Render the greenwashing scan as Markdown.

    Splits flags into unverified (⚠️) vs evidenced (✅) claims; emoji and
    em dashes restored from mojibake.
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    flags = greenwash_flags(doc["pages"])
    if not flags:
        return "✅ No greenwashing keywords detected in this document."
    bad = [f for f in flags if not f["ok"]]
    good = [f for f in flags if f["ok"]]
    out = [f"## 🚨 Greenwashing Scan — *{doc['name']}*\n",
           f"**{len(bad)} unverified ⚠️** | **{len(good)} evidenced ✅**\n\n---\n"]
    if bad:
        out.append("### ⚠️ Unverified Claims\n")
        for f in bad:
            out.append(f"📄 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
    if good:
        out.append("\n### ✅ Claims With Supporting Evidence\n")
        for f in good:
            out.append(f"📄 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
    return "\n".join(out)
def handle_graph():
    """Render discourse-graph statistics (node roles + edge counts) as Markdown.

    Arrows and separators restored from mojibake ("→", "·", 🕸️); edge names
    shown here match the keys produced by build_graph_summary().
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    role_c, edges = build_graph_summary(doc["pages"])
    total_nodes = sum(role_c.values())
    total_edges = sum(edges.values())
    # Note: if role_c is empty the generator below is empty too, so the
    # total_nodes division never runs with a zero denominator.
    role_rows = "\n".join(
        f"| `{r}` | {n} | {round(n/total_nodes*100,1)}% |"
        for r, n in role_c.most_common()
    )
    edge_rows = "\n".join(f"| `{e}` | {n} |" for e, n in edges.items())
    return (
        f"## 🕸️ Discourse Graph — *{doc['name']}*\n\n"
        f"**{total_nodes} nodes** (sentences) · **{total_edges} edges**\n\n"
        f"### Node Roles\n| Role | Count | Share |\n|------|-------|-------|\n{role_rows}\n\n"
        f"### Edge Types\n| Relation | Count |\n|----------|-------|\n{edge_rows}\n\n"
        "**How edges are inferred:**\n"
        "- Every consecutive sentence pair → `follows`\n"
        "- `claim` followed by `evidence` → `claim→evidence`\n"
        "- `policy` followed by `metric` → `policy→metric`\n\n"
        "> These relations power multi-hop retrieval: a question hitting a **claim** node "
        "automatically expands to its linked **evidence** nodes."
    )
# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Five-tab Gradio layout; each button routes to the matching handler above.
# Tab labels and headings use real emoji (restored from mojibake).
with gr.Blocks(title="ESG Analyser") as demo:
    gr.Markdown(
        "# 🌿 ESG Report Analyser\n"
        "Upload a sustainability / ESG report PDF and explore it instantly."
    )
    with gr.Tab("📤 Upload"):
        up_file = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
        up_btn = gr.Button("Process Document", variant="primary")
        up_out = gr.Markdown("Upload a PDF above and click **Process Document**.")
        up_btn.click(handle_upload, up_file, up_out)
    with gr.Tab("💬 Q&A"):
        q_box = gr.Textbox(label="Ask anything about the report",
                           placeholder="e.g. What are the carbon reduction targets?")
        q_btn = gr.Button("Ask", variant="primary")
        q_ans = gr.Markdown()
        q_ev = gr.Markdown()
        gr.Examples([
            ["What are the Scope 1 and 2 emissions?"],
            ["What diversity and inclusion initiatives are mentioned?"],
            ["What renewable energy commitments has the company made?"],
            ["What governance and audit policies are described?"],
            ["How does the company manage supply chain risks?"],
        ], inputs=q_box)
        q_btn.click(handle_qa, q_box, [q_ans, q_ev])
    with gr.Tab("📊 ESG Scores"):
        s_btn = gr.Button("Compute ESG Scores", variant="primary")
        s_out = gr.Markdown()
        s_btn.click(handle_scores, outputs=s_out)
    with gr.Tab("🚨 Greenwashing"):
        g_btn = gr.Button("Scan for Greenwashing", variant="primary")
        g_out = gr.Markdown()
        g_btn.click(handle_greenwash, outputs=g_out)
    with gr.Tab("🕸️ Graph"):
        d_btn = gr.Button("Build Discourse Graph", variant="primary")
        d_out = gr.Markdown()
        d_btn.click(handle_graph, outputs=d_out)

demo.launch()