"""
ESG Report Analyser β working prototype for HuggingFace Spaces
No ML models. No vector DB. Just pdfplumber + Gradio. Fully functional.
"""
import gradio as gr
import re
import json
from pathlib import Path
from collections import Counter
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# CONFIG
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Buzzword phrases typically used in unverified sustainability claims;
# matched case-insensitively against page text by greenwash_flags().
GREENWASHING_KW = [
    "carbon neutral", "net-zero", "net zero", "zero emissions",
    "100% renewable", "carbon offset", "zero waste", "eco-friendly",
    "fully sustainable", "nature positive", "carbon negative",
    "climate positive", "green certified", "biodegradable"
]
# Keyword lists per ESG pillar. Matching is by substring, so stems such as
# "recycl" deliberately catch "recycle", "recycling", "recyclable", etc.
ESG = {
    "Environmental": ["carbon","emission","climate","renewable","energy","water",
                      "waste","pollution","solar","wind","biodiversity","greenhouse",
                      "deforestation","recycl","fossil"],
    "Social": ["employee","diversity","inclusion","health","safety",
               "human rights","labour","labor","gender","community",
               "training","wellbeing","wage","stakeholder"],
    "Governance": ["board","audit","compliance","ethics","transparent",
                   "corruption","disclosure","regulation","policy",
                   "shareholder","executive","accountability","risk"]
}
# Industry-sector keyword lists used by detect_sector(); the sector with the
# most keyword hits wins, with a generic fallback when nothing matches.
SECTORS = {
    "Energy & Utilities": ["oil","gas","electricity","utility","power plant"],
    "Finance & Banking": ["bank","investment","portfolio","loan","insurance"],
    "Technology": ["software","data center","cloud","semiconductor"],
    "Manufacturing": ["factory","manufacturing","production","supply chain"],
    "Consumer Goods": ["retail","consumer","packaging","brand","fmcg"],
    "Healthcare": ["pharmaceutical","medical","hospital","clinical"],
    "Agriculture & Food": ["agriculture","food","farming","crop","livestock"],
    "Transportation": ["aviation","shipping","fleet","transport","logistics"],
}
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# STATE
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Module-level state holding the currently loaded document: parsed pages,
# the full concatenated text, and the file name. Overwritten by
# handle_upload() on each new upload and read by every tab handler.
# NOTE(review): a single global means concurrent users share one document.
doc = {"pages": [], "text": "", "name": ""}  # always reset on new upload
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# PDF PARSING
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def parse_pdf(path):
    """Extract text from every page of a PDF at *path*.

    Returns a list of dicts ``{"page": 1-based number, "text": stripped text}``;
    pages with no extractable text (e.g. scanned images) are omitted.
    """
    import pdfplumber  # imported lazily so module import stays light
    extracted = []
    with pdfplumber.open(path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            content = (page.extract_text() or "").strip()
            if content:
                extracted.append({"page": number, "text": content})
    return extracted
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# SEARCH (simple sentence-level keyword ranking β no model needed)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def search(query, pages, top_k=5):
    """Rank sentences across *pages* by overlap with the words of *query*.

    Scoring is the total substring count of each query word per sentence.
    Results are de-duplicated on their first 60 characters and truncated to
    *top_k* entries of ``{"page", "text", "score"}``.
    """
    query_words = set(re.sub(r"[^\w\s]", "", query.lower()).split())
    scored = []
    for page in pages:
        # sentence boundary: terminal punctuation + whitespace, or a newline
        for sentence in re.split(r"(?<=[.!?])\s+|\n", page["text"]):
            if len(sentence.split()) < 5:
                continue  # skip fragments too short to be meaningful
            lowered = sentence.lower()
            hits = sum(lowered.count(word) for word in query_words)
            if hits:
                scored.append({"page": page["page"], "text": sentence.strip(), "score": hits})
    # stable descending sort keeps document order among equal scores
    scored.sort(key=lambda item: item["score"], reverse=True)
    results, fingerprints = [], set()
    for candidate in scored:
        fingerprint = candidate["text"][:60]  # cheap near-duplicate filter
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        results.append(candidate)
        if len(results) == top_k:
            break
    return results
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# ANALYSIS HELPERS
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def esg_scores(text):
    """Return each ESG pillar's share of total keyword hits in *text*.

    Values are percentages rounded to one decimal; they sum to ~100 when any
    keyword matches at all.
    """
    lowered = text.lower()
    counts = {pillar: sum(lowered.count(word) for word in words)
              for pillar, words in ESG.items()}
    denominator = sum(counts.values()) or 1  # guard: no hits -> avoid div by zero
    return {pillar: round(hits / denominator * 100, 1)
            for pillar, hits in counts.items()}
def detect_sector(text):
    """Guess the report's industry sector by keyword frequency, with a generic fallback."""
    lowered = text.lower()
    tallies = {sector: sum(lowered.count(term) for term in terms)
               for sector, terms in SECTORS.items()}
    winner = max(tallies, key=tallies.get)
    return winner if tallies[winner] else "General / Diversified"
def greenwash_flags(pages):
    """Scan *pages* for greenwashing buzzwords and mark whether each looks evidenced.

    A page's claims count as "ok" when the page also contains verification
    language ("certified", "audited", ...). Returns one flag per unique
    (page, keyword) pair as ``{"page", "kw", "snip", "ok"}`` where *snip* is
    the first sentence containing the keyword (capped at 220 chars).
    """
    verifiers = ("certified", "verified", "audited", "third party", "sbti", "independently")
    flags, seen = [], set()
    for pg in pages:
        t = pg["text"].lower()
        matched = [kw for kw in GREENWASHING_KW if kw in t]
        if not matched:
            continue
        # Hoisted out of the keyword loop: the sentence split and the
        # verification check are per-page invariants — same output, less work.
        sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
        verified = any(w in t for w in verifiers)
        for kw in matched:
            if (pg["page"], kw) in seen:
                continue
            seen.add((pg["page"], kw))
            # first sentence mentioning the keyword, else a raw text prefix
            snip = next((s for s in sentences if kw in s.lower()), pg["text"][:180])
            flags.append({"page": pg["page"], "kw": kw, "snip": snip[:220], "ok": verified})
    return flags
def classify_sentence(s):
    """Assign a discourse role to sentence *s*.

    Roles, checked in priority order: "claim" (greenwashing buzzword),
    "evidence" (quantities/units), "policy" (targets/commitments),
    "metric" (KPI language), else "context".
    """
    lowered = s.lower()
    rules = (
        ("claim", GREENWASHING_KW),
        ("evidence", ["%", "tonne", "kwh", "mwh", "litre", "gallon"]),
        ("policy", ["target", "goal", "by 2030", "by 2050", "we will", "commit"]),
        ("metric", ["kpi", "metric", "indicator", "index"]),
    )
    for role, markers in rules:
        if any(marker in lowered for marker in markers):
            return role
    return "context"
def build_graph_summary(pages):
    """Classify every sentence in *pages* and tally node roles and edge types.

    Each consecutive sentence pair (across page boundaries too) contributes a
    "follows" edge; a claim immediately followed by evidence, or a policy by a
    metric, adds the corresponding typed edge. Returns
    ``(Counter of roles, dict of edge counts)``.
    """
    roles = Counter()
    edges = {"follows": 0, "claimβevidence": 0, "policyβmetric": 0}
    previous = None
    for pg in pages:
        for sentence in re.split(r"(?<=[.!?])\s+|\n", pg["text"]):
            if len(sentence.split()) < 4:
                continue  # ignore short fragments
            role = classify_sentence(sentence)
            roles[role] += 1
            if previous is not None:
                edges["follows"] += 1
                if (previous, role) == ("claim", "evidence"):
                    edges["claimβevidence"] += 1
                elif (previous, role) == ("policy", "metric"):
                    edges["policyβmetric"] += 1
            previous = role
    return roles, edges
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# GRADIO HANDLERS
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
def handle_upload(pdf):
    """Parse an uploaded PDF into module state and return a Markdown summary.

    Mutates the global ``doc`` (pages, concatenated text, file name) so the
    other tab handlers can read it. Returns a user-facing status string on
    missing input, empty extraction, or any parsing error.
    """
    if pdf is None:
        return "β οΈ Upload a PDF file."
    try:
        pages = parse_pdf(pdf.name)
        if not pages:
            return "β No text found. Make sure the PDF is not a scanned image."
        doc["pages"] = pages
        doc["text"] = " ".join(p["text"] for p in pages)
        doc["name"] = Path(pdf.name).name
        role_c, _ = build_graph_summary(pages)
        return (
            # FIX: this literal was split across two lines by a mangled emoji,
            # leaving an unterminated f-string; restored as one valid line.
            f"✅ **{doc['name']}** loaded\n\n"
            f"- **{len(pages)} pages** parsed\n"
            f"- **{sum(role_c.values())} sentences** analysed\n"
            f"- Node roles: `{dict(role_c)}`\n\n"
            "Use the tabs above to explore the report."
        )
    except Exception as e:
        # broad catch is deliberate: this is the UI boundary, surface the error
        return f"β Error: {e}"
def handle_qa(question):
    """Answer a free-text *question* using keyword search over the loaded document.

    Returns ``(answer markdown, evidence markdown)``; the evidence pane is
    empty when no document is loaded, the question is blank, or nothing matches.
    """
    if not doc["pages"]:
        return "β οΈ Upload a document first.", ""
    if not question.strip():
        return "β οΈ Type a question.", ""
    hits = search(question, doc["pages"])
    if not hits:
        return "Nothing relevant found. Try different keywords.", ""
    answer_parts = [f"### Answer β *{doc['name']}*\n\n"]
    answer_parts.extend(f"**Page {h['page']}:** {h['text']}\n\n" for h in hits)
    evidence_parts = ["### π Matched Sentences\n\n"]
    for rank, hit in enumerate(hits, 1):
        role = classify_sentence(hit["text"])
        evidence_parts.append(
            f"**[{rank}] Page {hit['page']} Β· role `{role}` Β· score {hit['score']}**\n> {hit['text']}\n\n"
        )
    return "".join(answer_parts), "".join(evidence_parts)
def handle_scores():
    """Render ESG pillar scores, an overall average, and the detected sector as Markdown."""
    if not doc["pages"]:
        return "β οΈ Upload a document first."
    scores = esg_scores(doc["text"])
    sector = detect_sector(doc["text"])
    overall = round(sum(scores.values()) / 3, 1)

    def bar(value):
        # 20-cell text bar, one filled cell per 5 percentage points
        filled = min(int(value / 5), 20)
        return "β" * filled + "β" * (20 - filled)

    icons = {"Environmental": "πΏ", "Social": "π₯", "Governance": "ποΈ"}
    rows = "\n".join(
        f"| {icons[pillar]} {pillar} | {pct}% | `{bar(pct)}` |"
        for pillar, pct in scores.items()
    )
    return (
        f"## π ESG Scores β *{doc['name']}*\n\n"
        f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
        f"| β Overall | **{overall}%** | `{bar(overall)}` |\n\n"
        f"**Sector detected:** {sector}\n\n"
        "> Scores reflect keyword frequency across the report."
    )
def handle_greenwash():
    """Render the greenwashing scan as Markdown, split into unverified vs evidenced claims."""
    if not doc["pages"]:
        return "β οΈ Upload a document first."
    flags = greenwash_flags(doc["pages"])
    if not flags:
        # FIX: literal was split mid-line by a mangled emoji (SyntaxError); restored.
        return "✅ No greenwashing keywords detected in this document."
    bad = [f for f in flags if not f["ok"]]
    good = [f for f in flags if f["ok"]]
    out = [f"## π¨ Greenwashing Scan β *{doc['name']}*\n",
           # FIX: second broken-literal restoration (mangled checkmark emoji)
           f"**{len(bad)} unverified β οΈ** | **{len(good)} evidenced ✅**\n\n---\n"]
    if bad:
        out.append("### β οΈ Unverified Claims\n")
        for flag in bad:  # renamed from `f` to avoid shadowing the list-comp var
            out.append(f"π **Page {flag['page']}** β `{flag['kw']}`\n> {flag['snip']}\n")
    if good:
        # FIX: third broken-literal restoration
        out.append("\n### ✅ Claims With Supporting Evidence\n")
        for flag in good:
            out.append(f"π **Page {flag['page']}** β `{flag['kw']}`\n> {flag['snip']}\n")
    return "\n".join(out)
def handle_graph():
    """Render discourse-graph statistics (node roles and edge counts) as Markdown."""
    if not doc["pages"]:
        return "β οΈ Upload a document first."
    role_c, edges = build_graph_summary(doc["pages"])
    node_total = sum(role_c.values())
    edge_total = sum(edges.values())
    role_rows = "\n".join(
        f"| `{r}` | {n} | {round(n/node_total*100,1)}% |"
        for r, n in role_c.most_common()
    )
    edge_rows = "\n".join(f"| `{e}` | {n} |" for e, n in edges.items())
    return (
        f"## πΈοΈ Discourse Graph β *{doc['name']}*\n\n"
        f"**{node_total} nodes** (sentences) Β· **{edge_total} edges**\n\n"
        f"### Node Roles\n| Role | Count | Share |\n|------|-------|-------|\n{role_rows}\n\n"
        f"### Edge Types\n| Relation | Count |\n|----------|-------|\n{edge_rows}\n\n"
        "**How edges are inferred:**\n"
        "- Every consecutive sentence pair β `follows`\n"
        "- `claim` followed by `evidence` β `claimβevidence`\n"
        "- `policy` followed by `metric` β `policyβmetric`\n\n"
        "> These relations power multi-hop retrieval: a question hitting a **claim** node "
        "automatically expands to its linked **evidence** nodes."
    )
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# UI
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Gradio UI: five tabs sharing the module-level `doc` state. Components are
# created in declaration order inside the Blocks context; each button wires a
# handler above to its output Markdown pane.
with gr.Blocks(title="ESG Analyser") as demo:
    gr.Markdown(
        "# πΏ ESG Report Analyser\n"
        "Upload a sustainability / ESG report PDF and explore it instantly."
    )
    # Tab 1: upload & parse — populates `doc` for the other tabs
    with gr.Tab("π€ Upload"):
        up_file = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
        up_btn = gr.Button("Process Document", variant="primary")
        up_out = gr.Markdown("Upload a PDF above and click **Process Document**.")
        up_btn.click(handle_upload, up_file, up_out)
    # Tab 2: keyword Q&A with example questions
    with gr.Tab("π¬ Q&A"):
        q_box = gr.Textbox(label="Ask anything about the report",
                           placeholder="e.g. What are the carbon reduction targets?")
        q_btn = gr.Button("Ask", variant="primary")
        q_ans = gr.Markdown()
        q_ev = gr.Markdown()
        gr.Examples([
            ["What are the Scope 1 and 2 emissions?"],
            ["What diversity and inclusion initiatives are mentioned?"],
            ["What renewable energy commitments has the company made?"],
            ["What governance and audit policies are described?"],
            ["How does the company manage supply chain risks?"],
        ], inputs=q_box)
        q_btn.click(handle_qa, q_box, [q_ans, q_ev])
    # Tab 3: keyword-frequency ESG scoring
    with gr.Tab("π ESG Scores"):
        s_btn = gr.Button("Compute ESG Scores", variant="primary")
        s_out = gr.Markdown()
        s_btn.click(handle_scores, outputs=s_out)
    # Tab 4: greenwashing keyword scan
    with gr.Tab("π¨ Greenwashing"):
        g_btn = gr.Button("Scan for Greenwashing", variant="primary")
        g_out = gr.Markdown()
        g_btn.click(handle_greenwash, outputs=g_out)
    # Tab 5: discourse-graph statistics
    with gr.Tab("πΈοΈ Graph"):
        d_btn = gr.Button("Build Discourse Graph", variant="primary")
        d_out = gr.Markdown()
        d_btn.click(handle_graph, outputs=d_out)
# FIX: dropped a stray trailing "|" (web-scrape table residue) that made this
# line a SyntaxError. Launches the Gradio server (blocking).
demo.launch()