Spaces:

CiLprototype
/

esg-intelligence

Sleeping

App Files Files Community

GirishaBuilds01 committed on Mar 13

Commit

e4c527f

verified ·

1 Parent(s): 1c87fd2

Update app.py

Browse files

Files changed (1) hide show

app.py +275 -158

app.py CHANGED Viewed

@@ -1,209 +1,326 @@
 """
-ESG Document Intelligence Prototype
-Qdrant vector search + Discourse Graph
 """
 import gradio as gr
 import re
 import json
 from pathlib import Path
-# ── lazy imports ──────────────────────────────────────────────────────────────
-def get_qdrant():
-    from qdrant_client import QdrantClient
-    from qdrant_client.models import Distance, VectorParams, PointStruct
-    return QdrantClient, Distance, VectorParams, PointStruct
-def get_embedder():
-    from sentence_transformers import SentenceTransformer
-    return SentenceTransformer("all-MiniLM-L6-v2")
-def get_pdfplumber():
-    import pdfplumber
-    return pdfplumber
-# ── Config ────────────────────────────────────────────────────────────────────
-COLLECTION = "esg"
-DIM = 384
 GREENWASHING_KW = [
     "carbon neutral", "net-zero", "net zero", "zero emissions",
     "100% renewable", "carbon offset", "zero waste", "eco-friendly",
-    "fully sustainable", "nature positive", "carbon negative"
 ]
-ESG_KW = {
-    "🌿 Environmental": ["carbon","emission","climate","renewable","energy","water","waste","pollution","solar","biodiversity"],
-    "👥 Social":        ["employee","diversity","inclusion","health","safety","human rights","labor","gender","community"],
-    "🏛️ Governance":   ["board","audit","compliance","ethics","transparency","corruption","disclosure","regulation","policy"]
 }
-# ── State ─────────────────────────────────────────────────────────────────────
-state = {"client": None, "embedder": None, "chunks": [], "name": "", "ready": False}
-# ── Init ──────────────────────────────────────────────────────────────────────
-def init():
-    if state["embedder"] is None:
-        state["embedder"] = get_embedder()
-    if state["client"] is None:
-        QdrantClient, Distance, VectorParams, _ = get_qdrant()
-        c = QdrantClient(":memory:")
-        c.recreate_collection(COLLECTION, vectors_config=VectorParams(size=DIM, distance=Distance.COSINE))
-        state["client"] = c
-# ── PDF + chunking ────────────────────────────────────────────────────────────
-def load_pdf(path):
-    pdfplumber = get_pdfplumber()
     pages = []
     with pdfplumber.open(path) as pdf:
         for i, p in enumerate(pdf.pages):
             t = (p.extract_text() or "").strip()
             if t:
-                pages.append({"page": i+1, "text": t})
     return pages
-def chunk(pages, size=250):
-    out = []
     for pg in pages:
-        words = pg["text"].split()
-        for s in range(0, len(words), size):
-            t = " ".join(words[s:s+size])
-            if len(t) > 30:
-                out.append({"page": pg["page"], "text": t})
     return out
-# ── Discourse role ────────────────────────────────────────────────────────────
-def role(text):
     t = text.lower()
-    if any(k in t for k in GREENWASHING_KW):            return "claim"
-    if any(k in t for k in ["%","tonnes","kwh","mwh"]): return "evidence"
-    if any(k in t for k in ["target","goal","by 2030","by 2050","we will","commitment"]): return "policy"
-    if any(k in t for k in ["kpi","metric","indicator"]): return "metric"
     return "context"
-# ── Handlers ──────────────────────────────────────────────────────────────────
-def process(pdf):
-    if pdf is None: return "⚠️ Upload a PDF first."
     try:
-        init()
-        _, _, _, PointStruct = get_qdrant()
-        pages  = load_pdf(pdf.name)
-        chunks = chunk(pages)
-        embeds = state["embedder"].encode([c["text"] for c in chunks], batch_size=32, normalize_embeddings=True)
-        state["client"].recreate_collection(COLLECTION,
-            vectors_config=__import__("qdrant_client").models.VectorParams(size=DIM,
-            distance=__import__("qdrant_client").models.Distance.COSINE))
-        pts = [PointStruct(id=i, vector=e.tolist(), payload={"page": c["page"], "text": c["text"]})
-               for i,(c,e) in enumerate(zip(chunks, embeds))]
-        state["client"].upsert(COLLECTION, pts)
-        state["chunks"] = chunks
-        state["name"]   = Path(pdf.name).name
-        state["ready"]  = True
-        roles = {}
-        for c in chunks:
-            r = role(c["text"]); roles[r] = roles.get(r,0)+1
-        return (f"✅ **{state['name']}** processed\n\n"
-                f"- Pages: **{len(pages)}** | Chunks: **{len(chunks)}**\n"
-                f"- Discourse nodes: `{json.dumps(roles)}`\n\n"
-                "Explore the tabs →")
     except Exception as e:
-        return f"❌ {e}"
-def ask(q):
-    if not state["ready"]: return "⚠️ Upload a document first.", ""
-    if not q.strip():      return "⚠️ Enter a question.", ""
-    vec = state["embedder"].encode([q], normalize_embeddings=True)[0].tolist()
-    hits = state["client"].search(COLLECTION, vec, limit=4, with_payload=True)
-    ans  = f"**Results from {state['name']}:**\n\n"
-    evid = "### 📎 Retrieved Evidence\n\n"
     for i, h in enumerate(hits, 1):
-        pg, txt = h.payload["page"], h.payload["text"]
-        r = role(txt)
-        ans  += f"📄 **Page {pg}:** {txt[:280]}…\n\n"
-        evid += f"**[{i}] Page {pg} | score {h.score:.3f} | role `{r}`**\n> {txt[:220]}…\n\n"
-    return ans, evid
-def esg_scores():
-    if not state["ready"]: return "⚠️ Upload a document first."
-    text = " ".join(c["text"] for c in state["chunks"]).lower()
-    counts = {k: sum(text.count(w) for w in ws) for k,ws in ESG_KW.items()}
-    total  = sum(counts.values()) or 1
-    scores = {k: round(v/total*100,1) for k,v in counts.items()}
-    overall = round(sum(scores.values())/3, 1)
-    def bar(v): return "█"*int(v/5) + "░"*(20-int(v/5))
-    rows = "\n".join(f"| {k} | {v}% | `{bar(v)}` |" for k,v in scores.items())
-    return (f"## 📊 ESG Scores — *{state['name']}*\n\n"
-            f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
-            f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
-            "> Keyword-density proxy scores.")
-def greenwashing():
-    if not state["ready"]: return "⚠️ Upload a document first."
-    flags, seen = [], set()
-    for c in state["chunks"]:
-        t = c["text"].lower()
-        matched = [k for k in GREENWASHING_KW if k in t]
-        if matched:
-            key = (c["page"], matched[0])
-            if key not in seen:
-                seen.add(key)
-                verified = any(w in t for w in ["certified","verified","audited","third party","sbti"])
-                flags.append({"page":c["page"],"kws":matched,"snip":c["text"][:200],"ok":verified})
-    if not flags: return "✅ No greenwashing keywords found."
     bad  = [f for f in flags if not f["ok"]]
     good = [f for f in flags if f["ok"]]
-    out  = [f"## 🚨 Greenwashing — *{state['name']}*\n",
-            f"{len(bad)} unverified ⚠️  |  {len(good)} with evidence ✅\n\n---\n"]
     if bad:
-        out.append("### ⚠️ Unverified\n")
         for f in bad:
-            out.append(f"📍 **Page {f['page']}** — `{'`, `'.join(f['kws'])}`\n> {f['snip']}…\n")
     if good:
-        out.append("\n### ✅ Evidenced\n")
         for f in good:
-            out.append(f"📍 **Page {f['page']}** — `{'`, `'.join(f['kws'])}`\n> {f['snip']}…\n")
     return "\n".join(out)
-def graph():
-    if not state["ready"]: return "⚠️ Upload a document first."
-    roles = {}
-    for c in state["chunks"]:
-        r = role(c["text"]); roles[r] = roles.get(r,0)+1
-    rows = "\n".join(f"| `{r}` | {n} |" for r,n in sorted(roles.items(), key=lambda x:-x[1]))
-    return (f"## 🕸️ Discourse Graph — *{state['name']}*\n\n"
-            f"| Role | Chunks |\n|------|--------|\n{rows}\n\n"
-            "**Edges modelled:**\n"
-            "- `follows` — sequential chunks\n"
-            "- `supported_by` — claim → evidence\n"
-            "- `measured_by` — policy → metric\n\n"
-            "> Graph expansion enables multi-hop HyperRAG retrieval.")
-# ── UI ────────────────────────────────────────────────────────────────────────
-with gr.Blocks(title="ESG Intelligence") as demo:
-    gr.Markdown("# 🌿 ESG Document Intelligence\n*Qdrant semantic search · Discourse graph reasoning*")
     with gr.Tab("📤 Upload"):
-        f   = gr.File(label="ESG Report PDF", file_types=[".pdf"])
-        btn = gr.Button("Process", variant="primary")
-        out = gr.Markdown("Upload a PDF and click Process.")
-        btn.click(process, f, out)
     with gr.Tab("💬 Q&A"):
-        q   = gr.Textbox(label="Question", placeholder="What are the carbon reduction targets?")
-        btn2 = gr.Button("Ask", variant="primary")
-        ans  = gr.Markdown()
-        ev   = gr.Markdown()
-        gr.Examples([["What are Scope 1 and 2 emissions?"],["What diversity initiatives exist?"],
-                     ["What are the renewable energy targets?"],["What governance policies are in place?"]], q)
-        btn2.click(ask, q, [ans, ev])
     with gr.Tab("📊 ESG Scores"):
-        gr.Button("Compute", variant="primary").click(esg_scores, outputs=gr.Markdown())
     with gr.Tab("🚨 Greenwashing"):
-        gr.Button("Detect", variant="primary").click(greenwashing, outputs=gr.Markdown())
-    with gr.Tab("🕸️ Discourse Graph"):
-        gr.Button("Show", variant="primary").click(graph, outputs=gr.Markdown())
 demo.launch()

 """
+ESG Report Analyser — working prototype for HuggingFace Spaces
+No ML models. No vector DB. Just pdfplumber + Gradio. Fully functional.
 """
 import gradio as gr
 import re
 import json
 from pathlib import Path
+from collections import Counter
+# ─────────────────────────────────────────────────────────────────────────────
+# CONFIG
+# ─────────────────────────────────────────────────────────────────────────────
 GREENWASHING_KW = [
     "carbon neutral", "net-zero", "net zero", "zero emissions",
     "100% renewable", "carbon offset", "zero waste", "eco-friendly",
+    "fully sustainable", "nature positive", "carbon negative",
+    "climate positive", "green certified", "biodegradable"
 ]
+ESG = {
+    "Environmental": ["carbon","emission","climate","renewable","energy","water",
+                      "waste","pollution","solar","wind","biodiversity","greenhouse",
+                      "deforestation","recycl","fossil"],
+    "Social":        ["employee","diversity","inclusion","health","safety",
+                      "human rights","labour","labor","gender","community",
+                      "training","wellbeing","wage","stakeholder"],
+    "Governance":    ["board","audit","compliance","ethics","transparent",
+                      "corruption","disclosure","regulation","policy",
+                      "shareholder","executive","accountability","risk"]
+}
+SECTORS = {
+    "Energy & Utilities":  ["oil","gas","electricity","utility","power plant"],
+    "Finance & Banking":   ["bank","investment","portfolio","loan","insurance"],
+    "Technology":          ["software","data center","cloud","semiconductor"],
+    "Manufacturing":       ["factory","manufacturing","production","supply chain"],
+    "Consumer Goods":      ["retail","consumer","packaging","brand","fmcg"],
+    "Healthcare":          ["pharmaceutical","medical","hospital","clinical"],
+    "Agriculture & Food":  ["agriculture","food","farming","crop","livestock"],
+    "Transportation":      ["aviation","shipping","fleet","transport","logistics"],
 }
+# ─────────────────────────────────────────────────────────────────────────────
+# STATE
+# ─────────────────────────────────────────────────────────────────────────────
+doc = {"pages": [], "text": "", "name": ""}   # always reset on new upload
+# ─────────────────────────────────────────────────────────────────────────────
+# PDF PARSING
+# ─────────────────────────────────────────────────────────────────────────────
+def parse_pdf(path):
+    import pdfplumber
     pages = []
     with pdfplumber.open(path) as pdf:
         for i, p in enumerate(pdf.pages):
             t = (p.extract_text() or "").strip()
             if t:
+                pages.append({"page": i + 1, "text": t})
     return pages
+# ─────────────────────────────────────────────────────────────────────────────
+# SEARCH  (simple sentence-level keyword ranking — no model needed)
+# ─────────────────────────────────────────────────────────────────────────────
+def search(query, pages, top_k=5):
+    """Split every page into sentences, score by query word overlap, return best."""
+    q_words = set(re.sub(r"[^\w\s]", "", query.lower()).split())
+    candidates = []
     for pg in pages:
+        # split after sentence-ending punctuation (. ! ?) or at a newline
+        sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
+        for sent in sentences:
+            if len(sent.split()) < 5:
+                continue
+            score = sum(sent.lower().count(w) for w in q_words)
+            if score > 0:
+                candidates.append({"page": pg["page"], "text": sent.strip(), "score": score})
+    candidates.sort(key=lambda x: -x["score"])
+    # deduplicate by first 60 chars
+    seen, out = set(), []
+    for c in candidates:
+        key = c["text"][:60]
+        if key not in seen:
+            seen.add(key)
+            out.append(c)
+        if len(out) == top_k:
+            break
     return out
+# ─────────────────────────────────────────────────────────────────────────────
+# ANALYSIS HELPERS
+# ─────────────────────────────────────────────────────────────────────────────
+def esg_scores(text):
     t = text.lower()
+    raw = {k: sum(t.count(w) for w in ws) for k, ws in ESG.items()}
+    total = sum(raw.values()) or 1
+    return {k: round(v / total * 100, 1) for k, v in raw.items()}
+def detect_sector(text):
+    t = text.lower()
+    hits = {s: sum(t.count(w) for w in ws) for s, ws in SECTORS.items()}
+    best = max(hits, key=hits.get)
+    return best if hits[best] > 0 else "General / Diversified"
+def greenwash_flags(pages):
+    flags, seen = [], set()
+    for pg in pages:
+        t = pg["text"].lower()
+        matched = [kw for kw in GREENWASHING_KW if kw in t]
+        for kw in matched:
+            if (pg["page"], kw) not in seen:
+                seen.add((pg["page"], kw))
+                # grab the sentence containing the keyword
+                sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
+                snip = next((s for s in sentences if kw in s.lower()), pg["text"][:180])
+                verified = any(w in t for w in ["certified","verified","audited","third party","sbti","independently"])
+                flags.append({"page": pg["page"], "kw": kw, "snip": snip[:220], "ok": verified})
+    return flags
+def classify_sentence(s):
+    t = s.lower()
+    if any(k in t for k in GREENWASHING_KW):                         return "claim"
+    if any(k in t for k in ["%","tonne","kwh","mwh","litre","gallon"]): return "evidence"
+    if any(k in t for k in ["target","goal","by 2030","by 2050","we will","commit"]): return "policy"
+    if any(k in t for k in ["kpi","metric","indicator","index"]):     return "metric"
     return "context"
+def build_graph_summary(pages):
+    role_counts = Counter()
+    edges = {"follows": 0, "claim→evidence": 0, "policy→metric": 0}
+    prev_role = None
+    for pg in pages:
+        sentences = re.split(r"(?<=[.!?])\s+|\n", pg["text"])
+        for sent in sentences:
+            if len(sent.split()) < 4:
+                continue
+            r = classify_sentence(sent)
+            role_counts[r] += 1
+            if prev_role:
+                edges["follows"] += 1
+                if prev_role == "claim" and r == "evidence":
+                    edges["claim→evidence"] += 1
+                if prev_role == "policy" and r == "metric":
+                    edges["policy→metric"] += 1
+            prev_role = r
+    return role_counts, edges
+# ─────────────────────────────────────────────────────────────────────────────
+# GRADIO HANDLERS
+# ─────────────────────────────────────────────────────────────────────────────
+def handle_upload(pdf):
+    if pdf is None:
+        return "⚠️ Upload a PDF file."
     try:
+        pages = parse_pdf(pdf.name)
+        if not pages:
+            return "❌ No text found. Make sure the PDF is not a scanned image."
+        doc["pages"] = pages
+        doc["text"]  = " ".join(p["text"] for p in pages)
+        doc["name"]  = Path(pdf.name).name
+        role_c, _ = build_graph_summary(pages)
+        return (
+            f"✅ **{doc['name']}** loaded\n\n"
+            f"- **{len(pages)} pages** parsed\n"
+            f"- **{sum(role_c.values())} sentences** analysed\n"
+            f"- Node roles: `{dict(role_c)}`\n\n"
+            "Use the tabs above to explore the report."
+        )
     except Exception as e:
+        return f"❌ Error: {e}"
+def handle_qa(question):
+    if not doc["pages"]:
+        return "⚠️ Upload a document first.", ""
+    if not question.strip():
+        return "⚠️ Type a question.", ""
+    hits = search(question, doc["pages"])
+    if not hits:
+        return "Nothing relevant found. Try different keywords.", ""
+    answer = f"### Answer — *{doc['name']}*\n\n"
+    for h in hits:
+        answer += f"**Page {h['page']}:** {h['text']}\n\n"
+    evidence = "### 📎 Matched Sentences\n\n"
     for i, h in enumerate(hits, 1):
+        r = classify_sentence(h["text"])
+        evidence += f"**[{i}] Page {h['page']} · role `{r}` · score {h['score']}**\n> {h['text']}\n\n"
+    return answer, evidence
+def handle_scores():
+    if not doc["pages"]:
+        return "⚠️ Upload a document first."
+    scores  = esg_scores(doc["text"])
+    sector  = detect_sector(doc["text"])
+    overall = round(sum(scores.values()) / 3, 1)
+    def bar(v):
+        f = min(int(v / 5), 20)
+        return "█" * f + "░" * (20 - f)
+    icons = {"Environmental": "🌿", "Social": "👥", "Governance": "🏛️"}
+    rows  = "\n".join(
+        f"| {icons[k]} {k} | {v}% | `{bar(v)}` |"
+        for k, v in scores.items()
+    )
+    return (
+        f"## 📊 ESG Scores — *{doc['name']}*\n\n"
+        f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
+        f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
+        f"**Sector detected:** {sector}\n\n"
+        "> Scores reflect keyword frequency across the report."
+    )
+def handle_greenwash():
+    if not doc["pages"]:
+        return "⚠️ Upload a document first."
+    flags = greenwash_flags(doc["pages"])
+    if not flags:
+        return "✅ No greenwashing keywords detected in this document."
     bad  = [f for f in flags if not f["ok"]]
     good = [f for f in flags if f["ok"]]
+    out = [f"## 🚨 Greenwashing Scan — *{doc['name']}*\n",
+           f"**{len(bad)} unverified ⚠️** &nbsp;|&nbsp; **{len(good)} evidenced ✅**\n\n---\n"]
     if bad:
+        out.append("### ⚠️ Unverified Claims\n")
         for f in bad:
+            out.append(f"📍 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
     if good:
+        out.append("\n### ✅ Claims With Supporting Evidence\n")
         for f in good:
+            out.append(f"📍 **Page {f['page']}** — `{f['kw']}`\n> {f['snip']}\n")
     return "\n".join(out)
+def handle_graph():
+    if not doc["pages"]:
+        return "⚠️ Upload a document first."
+    role_c, edges = build_graph_summary(doc["pages"])
+    total_nodes = sum(role_c.values())
+    total_edges = sum(edges.values())
+    role_rows = "\n".join(
+        f"| `{r}` | {n} | {round(n/total_nodes*100,1)}% |"
+        for r, n in role_c.most_common()
+    )
+    edge_rows = "\n".join(f"| `{e}` | {n} |" for e, n in edges.items())
+    return (
+        f"## 🕸️ Discourse Graph — *{doc['name']}*\n\n"
+        f"**{total_nodes} nodes** (sentences) · **{total_edges} edges**\n\n"
+        f"### Node Roles\n| Role | Count | Share |\n|------|-------|-------|\n{role_rows}\n\n"
+        f"### Edge Types\n| Relation | Count |\n|----------|-------|\n{edge_rows}\n\n"
+        "**How edges are inferred:**\n"
+        "- Every consecutive sentence pair → `follows`\n"
+        "- `claim` followed by `evidence` → `claim→evidence`\n"
+        "- `policy` followed by `metric` → `policy→metric`\n\n"
+        "> These relations power multi-hop retrieval: a question hitting a **claim** node "
+        "automatically expands to its linked **evidence** nodes."
+    )
+# ─────────────────────────────────────────────────────────────────────────────
+# UI
+# ─────────────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="ESG Analyser") as demo:
+    gr.Markdown(
+        "# 🌿 ESG Report Analyser\n"
+        "Upload a sustainability / ESG report PDF and explore it instantly."
+    )
     with gr.Tab("📤 Upload"):
+        up_file = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
+        up_btn  = gr.Button("Process Document", variant="primary")
+        up_out  = gr.Markdown("Upload a PDF above and click **Process Document**.")
+        up_btn.click(handle_upload, up_file, up_out)
     with gr.Tab("💬 Q&A"):
+        q_box = gr.Textbox(label="Ask anything about the report",
+                           placeholder="e.g. What are the carbon reduction targets?")
+        q_btn = gr.Button("Ask", variant="primary")
+        q_ans = gr.Markdown()
+        q_ev  = gr.Markdown()
+        gr.Examples([
+            ["What are the Scope 1 and 2 emissions?"],
+            ["What diversity and inclusion initiatives are mentioned?"],
+            ["What renewable energy commitments has the company made?"],
+            ["What governance and audit policies are described?"],
+            ["How does the company manage supply chain risks?"],
+        ], inputs=q_box)
+        q_btn.click(handle_qa, q_box, [q_ans, q_ev])
     with gr.Tab("📊 ESG Scores"):
+        s_btn = gr.Button("Compute ESG Scores", variant="primary")
+        s_out = gr.Markdown()
+        s_btn.click(handle_scores, outputs=s_out)
     with gr.Tab("🚨 Greenwashing"):
+        g_btn = gr.Button("Scan for Greenwashing", variant="primary")
+        g_out = gr.Markdown()
+        g_btn.click(handle_greenwash, outputs=g_out)
+    with gr.Tab("🕸️ Graph"):
+        d_btn = gr.Button("Build Discourse Graph", variant="primary")
+        d_out = gr.Markdown()
+        d_btn.click(handle_graph, outputs=d_out)
 demo.launch()