# HuggingFace Space app.py — author GirishaBuilds01, commit e4c527f (verified)
"""
ESG Report Analyser β€” working prototype for HuggingFace Spaces
No ML models. No vector DB. Just pdfplumber + Gradio. Fully functional.
"""
import gradio as gr
import re
import json
from pathlib import Path
from collections import Counter
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# Buzzword phrases commonly used in unsubstantiated environmental claims.
# Matched case-insensitively as substrings of page text.
GREENWASHING_KW = [
    "carbon neutral", "net-zero", "net zero", "zero emissions",
    "100% renewable", "carbon offset", "zero waste", "eco-friendly",
    "fully sustainable", "nature positive", "carbon negative",
    "climate positive", "green certified", "biodegradable"
]
# Keyword lists per ESG pillar. Scoring is substring-frequency based, so
# stems such as "recycl" intentionally match "recycle"/"recycling" etc.
ESG = {
    "Environmental": ["carbon","emission","climate","renewable","energy","water",
                      "waste","pollution","solar","wind","biodiversity","greenhouse",
                      "deforestation","recycl","fossil"],
    "Social": ["employee","diversity","inclusion","health","safety",
               "human rights","labour","labor","gender","community",
               "training","wellbeing","wage","stakeholder"],
    "Governance": ["board","audit","compliance","ethics","transparent",
                   "corruption","disclosure","regulation","policy",
                   "shareholder","executive","accountability","risk"]
}
# Sector-detection vocabulary: the sector whose keywords occur most often
# in the document wins (see detect_sector).
SECTORS = {
    "Energy & Utilities": ["oil","gas","electricity","utility","power plant"],
    "Finance & Banking": ["bank","investment","portfolio","loan","insurance"],
    "Technology": ["software","data center","cloud","semiconductor"],
    "Manufacturing": ["factory","manufacturing","production","supply chain"],
    "Consumer Goods": ["retail","consumer","packaging","brand","fmcg"],
    "Healthcare": ["pharmaceutical","medical","hospital","clinical"],
    "Agriculture & Food": ["agriculture","food","farming","crop","livestock"],
    "Transportation": ["aviation","shipping","fleet","transport","logistics"],
}
# ─────────────────────────────────────────────────────────────────────────────
# STATE
# ─────────────────────────────────────────────────────────────────────────────
# Single in-memory document shared by all handlers (one document at a time);
# each successful upload in handle_upload() overwrites all three fields.
doc = {"pages": [], "text": "", "name": ""}  # always reset on new upload
# ─────────────────────────────────────────────────────────────────────────────
# PDF PARSING
# ─────────────────────────────────────────────────────────────────────────────
def parse_pdf(path):
    """Extract text from every page of the PDF at *path*.

    Returns a list of ``{"page": <1-based number>, "text": <stripped text>}``
    dicts, skipping pages that yield no text (e.g. scanned images).
    """
    import pdfplumber  # deferred import: keeps startup fast until first upload
    extracted = []
    with pdfplumber.open(path) as reader:
        for page_no, page in enumerate(reader.pages, start=1):
            content = (page.extract_text() or "").strip()
            if content:
                extracted.append({"page": page_no, "text": content})
    return extracted
# ─────────────────────────────────────────────────────────────────────────────
# SEARCH (simple sentence-level keyword ranking β€” no model needed)
# ─────────────────────────────────────────────────────────────────────────────
def search(query, pages, top_k=5):
    """Rank sentences across *pages* by keyword overlap with *query*.

    Each sentence (of at least 5 words) is scored by the total number of
    case-insensitive substring occurrences of the query's words. Returns up
    to *top_k* deduplicated hits as {"page", "text", "score"} dicts, best
    score first.
    """
    terms = set(re.sub(r"[^\w\s]", "", query.lower()).split())
    scored = []
    for entry in pages:
        # sentence boundaries: end-of-sentence punctuation or a newline
        for sentence in re.split(r"(?<=[.!?])\s+|\n", entry["text"]):
            if len(sentence.split()) < 5:  # skip fragments and headings
                continue
            lowered = sentence.lower()
            overlap = sum(lowered.count(term) for term in terms)
            if overlap:
                scored.append({"page": entry["page"],
                               "text": sentence.strip(),
                               "score": overlap})
    scored.sort(key=lambda hit: hit["score"], reverse=True)
    # deduplicate near-identical sentences via a 60-char prefix fingerprint
    results, fingerprints = [], set()
    for hit in scored:
        prefix = hit["text"][:60]
        if prefix in fingerprints:
            continue
        fingerprints.add(prefix)
        results.append(hit)
        if len(results) == top_k:
            break
    return results
# ─────────────────────────────────────────────────────────────────────────────
# ANALYSIS HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def esg_scores(text):
    """Return each ESG pillar's share (%) of all ESG keyword hits in *text*.

    Shares are keyword-frequency based and sum to ~100 (rounding aside).
    """
    lowered = text.lower()
    hits = {pillar: sum(lowered.count(word) for word in words)
            for pillar, words in ESG.items()}
    denominator = sum(hits.values()) or 1  # guard: text with zero keyword hits
    return {pillar: round(count / denominator * 100, 1)
            for pillar, count in hits.items()}
def detect_sector(text):
    """Guess the industry sector whose keywords occur most often in *text*."""
    lowered = text.lower()
    tallies = {sector: sum(lowered.count(word) for word in words)
               for sector, words in SECTORS.items()}
    winner = max(tallies, key=tallies.get)
    if tallies[winner] == 0:
        return "General / Diversified"
    return winner
def greenwash_flags(pages):
    """Scan *pages* for greenwashing buzzwords.

    For every (page, keyword) hit, returns a dict with the page number, the
    keyword, a snippet (the sentence containing the keyword, capped at 220
    chars, falling back to the page head) and "ok" — True when the same page
    also contains verification language ("certified", "audited", ...), i.e.
    the claim appears evidenced somewhere on that page.
    """
    flags, seen = [], set()
    for pg in pages:
        t = pg["text"].lower()
        matched = [kw for kw in GREENWASHING_KW if kw in t]
        if not matched:
            continue
        # Page-level work hoisted out of the per-keyword loop: the sentence
        # split and the verification scan depend only on the page, not the kw.
        sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
        verified = any(w in t for w in ["certified","verified","audited","third party","sbti","independently"])
        for kw in matched:
            if (pg["page"], kw) in seen:
                continue
            seen.add((pg["page"], kw))
            # first sentence containing the keyword; fall back to page head
            snip = next((s for s in sentences if kw in s.lower()), pg["text"][:180])
            flags.append({"page": pg["page"], "kw": kw, "snip": snip[:220], "ok": verified})
    return flags
def classify_sentence(s):
    """Assign a discourse role to sentence *s*.

    Roles, checked in priority order: "claim" (greenwashing buzzword),
    "evidence" (quantitative marker), "policy" (target/commitment wording),
    "metric" (KPI wording), else "context".
    """
    lowered = s.lower()
    role_markers = [
        ("claim", GREENWASHING_KW),
        ("evidence", ["%", "tonne", "kwh", "mwh", "litre", "gallon"]),
        ("policy", ["target", "goal", "by 2030", "by 2050", "we will", "commit"]),
        ("metric", ["kpi", "metric", "indicator", "index"]),
    ]
    for role, markers in role_markers:
        if any(marker in lowered for marker in markers):
            return role
    return "context"
def build_graph_summary(pages):
    """Count sentence roles and infer simple discourse edges across *pages*.

    Every consecutive pair of classified sentences adds a "follows" edge;
    a claim directly followed by evidence (or a policy by a metric) adds a
    typed edge as well. The sentence sequence deliberately runs across page
    boundaries. Returns (Counter of roles, dict of edge counts).
    """
    roles = Counter()
    edges = {"follows": 0, "claim→evidence": 0, "policy→metric": 0}
    last_role = None
    for pg in pages:
        for sent in re.split(r"(?<=[.!?])\s+|\n", pg["text"]):
            if len(sent.split()) < 4:  # too short to carry a role
                continue
            role = classify_sentence(sent)
            roles[role] += 1
            if last_role:
                edges["follows"] += 1
                if last_role == "claim" and role == "evidence":
                    edges["claim→evidence"] += 1
                elif last_role == "policy" and role == "metric":
                    edges["policy→metric"] += 1
            last_role = role
    return roles, edges
# ─────────────────────────────────────────────────────────────────────────────
# GRADIO HANDLERS
# ─────────────────────────────────────────────────────────────────────────────
def handle_upload(pdf):
    """Parse the uploaded PDF into the global `doc` and return a summary.

    Returns a markdown status string; on any parsing failure the exception
    text is surfaced to the UI instead of raising.
    """
    if pdf is None:
        return "⚠️ Upload a PDF file."
    try:
        parsed = parse_pdf(pdf.name)
        if not parsed:
            return "❌ No text found. Make sure the PDF is not a scanned image."
        # overwrite the module-level state wholesale for the new document
        doc["pages"] = parsed
        doc["text"] = " ".join(p["text"] for p in parsed)
        doc["name"] = Path(pdf.name).name
        roles, _ = build_graph_summary(parsed)
        summary = (
            f"βœ… **{doc['name']}** loaded\n\n"
            f"- **{len(parsed)} pages** parsed\n"
            f"- **{sum(roles.values())} sentences** analysed\n"
            f"- Node roles: `{dict(roles)}`\n\n"
            "Use the tabs above to explore the report."
        )
        return summary
    except Exception as e:  # surface the error in the UI rather than crash
        return f"❌ Error: {e}"
def handle_qa(question):
    """Answer *question* via keyword search over the loaded document.

    Returns a (answer markdown, evidence markdown) pair; the second element
    is empty whenever there is nothing to show.
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first.", ""
    if not question.strip():
        return "⚠️ Type a question.", ""
    matches = search(question, doc["pages"])
    if not matches:
        return "Nothing relevant found. Try different keywords.", ""
    answer_parts = [f"### Answer β€” *{doc['name']}*\n\n"]
    answer_parts.extend(f"**Page {m['page']}:** {m['text']}\n\n" for m in matches)
    evidence_parts = ["### πŸ“Ž Matched Sentences\n\n"]
    for idx, m in enumerate(matches, 1):
        role = classify_sentence(m["text"])
        evidence_parts.append(
            f"**[{idx}] Page {m['page']} Β· role `{role}` Β· score {m['score']}**\n> {m['text']}\n\n"
        )
    return "".join(answer_parts), "".join(evidence_parts)
def handle_scores():
    """Render the ESG keyword-frequency scorecard as a markdown table."""
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    pillar_pct = esg_scores(doc["text"])
    sector = detect_sector(doc["text"])
    overall = round(sum(pillar_pct.values()) / 3, 1)

    def bar(value):
        # 20-slot text gauge: one filled slot per 5 percentage points
        filled = min(int(value / 5), 20)
        return "β–ˆ" * filled + "β–‘" * (20 - filled)

    icons = {"Environmental": "🌿", "Social": "πŸ‘₯", "Governance": "πŸ›οΈ"}
    table_rows = "\n".join(
        f"| {icons[pillar]} {pillar} | {pct}% | `{bar(pct)}` |"
        for pillar, pct in pillar_pct.items()
    )
    return (
        f"## πŸ“Š ESG Scores β€” *{doc['name']}*\n\n"
        f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{table_rows}\n"
        f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
        f"**Sector detected:** {sector}\n\n"
        "> Scores reflect keyword frequency across the report."
    )
def handle_greenwash():
    """Render the greenwashing scan (unverified vs evidenced claims) as markdown."""
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    found = greenwash_flags(doc["pages"])
    if not found:
        return "βœ… No greenwashing keywords detected in this document."
    unverified = [f for f in found if not f["ok"]]
    evidenced = [f for f in found if f["ok"]]
    lines = [
        f"## 🚨 Greenwashing Scan β€” *{doc['name']}*\n",
        f"**{len(unverified)} unverified ⚠️** &nbsp;|&nbsp; **{len(evidenced)} evidenced βœ…**\n\n---\n",
    ]
    if unverified:
        lines.append("### ⚠️ Unverified Claims\n")
        lines.extend(f"πŸ“ **Page {f['page']}** β€” `{f['kw']}`\n> {f['snip']}\n"
                     for f in unverified)
    if evidenced:
        lines.append("\n### βœ… Claims With Supporting Evidence\n")
        lines.extend(f"πŸ“ **Page {f['page']}** β€” `{f['kw']}`\n> {f['snip']}\n"
                     for f in evidenced)
    return "\n".join(lines)
def handle_graph():
    """Render the discourse-graph summary (node roles + edge counts) as markdown.

    Fix: guard against a document whose sentences are all too short to
    classify — build_graph_summary then returns an empty Counter and the
    original share computation divided by zero.
    """
    if not doc["pages"]:
        return "⚠️ Upload a document first."
    role_c, edges = build_graph_summary(doc["pages"])
    total_nodes = sum(role_c.values())
    if total_nodes == 0:
        # no classifiable sentences → no graph (avoids ZeroDivisionError below)
        return "⚠️ No sentences long enough to build a graph."
    total_edges = sum(edges.values())
    role_rows = "\n".join(
        f"| `{r}` | {n} | {round(n/total_nodes*100,1)}% |"
        for r, n in role_c.most_common()
    )
    edge_rows = "\n".join(f"| `{e}` | {n} |" for e, n in edges.items())
    return (
        f"## πŸ•ΈοΈ Discourse Graph β€” *{doc['name']}*\n\n"
        f"**{total_nodes} nodes** (sentences) Β· **{total_edges} edges**\n\n"
        f"### Node Roles\n| Role | Count | Share |\n|------|-------|-------|\n{role_rows}\n\n"
        f"### Edge Types\n| Relation | Count |\n|----------|-------|\n{edge_rows}\n\n"
        "**How edges are inferred:**\n"
        "- Every consecutive sentence pair β†’ `follows`\n"
        "- `claim` followed by `evidence` → `claim→evidence`\n"
        "- `policy` followed by `metric` → `policy→metric`\n\n"
        "> These relations power multi-hop retrieval: a question hitting a **claim** node "
        "automatically expands to its linked **evidence** nodes."
    )
# ─────────────────────────────────────────────────────────────────────────────
# UI
# ─────────────────────────────────────────────────────────────────────────────
# Build the Gradio UI: one tab per analysis. Component creation order fixes
# the on-screen layout, so it mirrors the original exactly.
with gr.Blocks(title="ESG Analyser") as demo:
    gr.Markdown(
        "# 🌿 ESG Report Analyser\n"
        "Upload a sustainability / ESG report PDF and explore it instantly."
    )
    # Tab 1 — upload & parse
    with gr.Tab("πŸ“€ Upload"):
        pdf_input = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
        process_btn = gr.Button("Process Document", variant="primary")
        upload_status = gr.Markdown("Upload a PDF above and click **Process Document**.")
        process_btn.click(handle_upload, pdf_input, upload_status)
    # Tab 2 — keyword Q&A with example questions
    with gr.Tab("πŸ’¬ Q&A"):
        question_box = gr.Textbox(label="Ask anything about the report",
                                  placeholder="e.g. What are the carbon reduction targets?")
        ask_btn = gr.Button("Ask", variant="primary")
        answer_md = gr.Markdown()
        evidence_md = gr.Markdown()
        gr.Examples([
            ["What are the Scope 1 and 2 emissions?"],
            ["What diversity and inclusion initiatives are mentioned?"],
            ["What renewable energy commitments has the company made?"],
            ["What governance and audit policies are described?"],
            ["How does the company manage supply chain risks?"],
        ], inputs=question_box)
        ask_btn.click(handle_qa, question_box, [answer_md, evidence_md])
    # Tab 3 — ESG scorecard
    with gr.Tab("πŸ“Š ESG Scores"):
        score_btn = gr.Button("Compute ESG Scores", variant="primary")
        score_md = gr.Markdown()
        score_btn.click(handle_scores, outputs=score_md)
    # Tab 4 — greenwashing scan
    with gr.Tab("🚨 Greenwashing"):
        scan_btn = gr.Button("Scan for Greenwashing", variant="primary")
        scan_md = gr.Markdown()
        scan_btn.click(handle_greenwash, outputs=scan_md)
    # Tab 5 — discourse graph summary
    with gr.Tab("πŸ•ΈοΈ Graph"):
        graph_btn = gr.Button("Build Discourse Graph", variant="primary")
        graph_md = gr.Markdown()
        graph_btn.click(handle_graph, outputs=graph_md)
demo.launch()