Spaces: CiLprototype/esg-intelligence

Sleeping

App Files Community

GirishaBuilds01 committed on Mar 13

Commit

8f76a50

verified ·

1 Parent(s): e7ee167

Update app.py

Browse files

Files changed (1) hide show

app.py +168 -237

app.py CHANGED Viewed

@@ -1,278 +1,209 @@
 """
 ESG Document Intelligence Prototype
-Simple, lean version for HuggingFace Spaces
 """
 import gradio as gr
 import re
 import json
-# ── Constants ─────────────────────────────────────────────────────────────────
-GREENWASHING_KEYWORDS = [
-    "carbon neutral", "net-zero", "net zero", "climate positive",
-    "100% renewable", "zero emissions", "carbon negative", "eco-friendly",
-    "carbon offset", "zero waste", "nature positive", "planet positive",
-    "fully sustainable", "green certified"
-]
-ESG_CATEGORIES = {
-    "🌿 Environmental": [
-        "carbon", "emission", "climate", "renewable", "energy", "water",
-        "waste", "biodiversity", "pollution", "recycling", "greenhouse",
-        "fossil fuel", "solar", "wind", "deforestation"
-    ],
-    "👥 Social": [
-        "employee", "diversity", "inclusion", "health", "safety", "community",
-        "human rights", "labor", "gender", "training", "wellbeing",
-        "stakeholder", "philanthropy", "education", "wage"
-    ],
-    "🏛️ Governance": [
-        "board", "director", "audit", "compliance", "ethics", "transparency",
-        "corruption", "risk management", "disclosure", "accountability",
-        "shareholder", "executive", "policy", "regulation"
-    ]
-}
-SECTOR_KEYWORDS = {
-    "Energy & Utilities":  ["oil", "gas", "electricity", "utility", "power plant", "pipeline"],
-    "Finance & Banking":   ["bank", "investment", "portfolio", "loan", "insurance"],
-    "Technology":          ["software", "data center", "cloud", "semiconductor"],
-    "Manufacturing":       ["factory", "manufacturing", "production", "supply chain"],
-    "Consumer Goods":      ["product", "retail", "consumer", "packaging", "brand"],
-    "Healthcare":          ["health", "pharmaceutical", "medical", "hospital"],
-    "Agriculture & Food":  ["agriculture", "food", "farming", "crop", "livestock"],
-    "Transportation":      ["transport", "aviation", "shipping", "fleet"],
-}
-# ── Global State ──────────────────────────────────────────────────────────────
-_doc_chunks = []
-_doc_name   = ""
-# ── Helpers ───────────────────────────────────────────────────────────────────
-def extract_text(pdf_path):
-    import pdfplumber
     pages = []
-    with pdfplumber.open(pdf_path) as pdf:
-        for i, page in enumerate(pdf.pages):
-            text = (page.extract_text() or "").strip()
-            if text:
-                pages.append({"page": i + 1, "text": text})
     return pages
-def simple_chunk(pages, size=300):
-    chunks = []
     for pg in pages:
         words = pg["text"].split()
-        for start in range(0, len(words), size):
-            chunk_text = " ".join(words[start:start + size])
-            if len(chunk_text) > 40:
-                chunks.append({"page": pg["page"], "text": chunk_text})
-    return chunks
-def keyword_search(query, chunks, top_k=4):
-    query_words = set(re.sub(r"[^\w\s]", "", query.lower()).split())
-    scored = []
-    for chunk in chunks:
-        score = sum(chunk["text"].lower().count(w) for w in query_words)
-        if score > 0:
-            scored.append((score, chunk))
-    scored.sort(key=lambda x: -x[0])
-    return [c for _, c in scored[:top_k]]
-def classify_role(text):
     t = text.lower()
-    if any(kw in t for kw in GREENWASHING_KEYWORDS):
-        return "claim"
-    if any(w in t for w in ["data shows", "%", "tonnes", "kwh", "mwh", "measured"]):
-        return "evidence"
-    if any(w in t for w in ["policy", "target", "goal", "by 2030", "by 2050", "we will"]):
-        return "policy"
-    if any(w in t for w in ["kpi", "metric", "indicator", "score", "rating"]):
-        return "metric"
     return "context"
-# ── Analysis ──────────────────────────────────────────────────────────────────
-def compute_esg_scores(chunks):
-    text = " ".join(c["text"] for c in chunks).lower()
-    counts = {k: sum(text.count(kw) for kw in kws) for k, kws in ESG_CATEGORIES.items()}
-    total = sum(counts.values()) or 1
-    return {k: round(v / total * 100, 1) for k, v in counts.items()}
-def detect_sector(chunks):
-    text = " ".join(c["text"] for c in chunks).lower()
-    hits = {s: sum(text.count(kw) for kw in kws) for s, kws in SECTOR_KEYWORDS.items()}
-    best = max(hits, key=hits.get)
-    return best if hits[best] > 0 else "General / Diversified"
-def detect_greenwashing(chunks):
     flags, seen = [], set()
-    for chunk in chunks:
-        t = chunk["text"].lower()
-        matched = [kw for kw in GREENWASHING_KEYWORDS if kw in t]
         if matched:
-            key = (chunk["page"], matched[0])
             if key not in seen:
                 seen.add(key)
-                verified = any(w in t for w in ["certified", "verified", "audited", "third party", "sbti"])
-                flags.append({"page": chunk["page"], "keywords": matched,
-                               "snip": chunk["text"][:200], "verified": verified})
-    return flags
-# ── Gradio Handlers ────────────────────────────────────────────────────────────
-def process_pdf(pdf_file):
-    global _doc_chunks, _doc_name
-    if pdf_file is None:
-        return "⚠️ Please upload a PDF."
-    try:
-        pages = extract_text(pdf_file.name)
-        if not pages:
-            return "❌ Could not extract text. Use a text-based PDF (not scanned)."
-        _doc_chunks = simple_chunk(pages)
-        _doc_name   = pdf_file.name.split("/")[-1]
-        roles = {}
-        for c in _doc_chunks:
-            r = classify_role(c["text"])
-            roles[r] = roles.get(r, 0) + 1
-        return (
-            f"✅ **Processed:** {_doc_name}\n\n"
-            f"- Pages: **{len(pages)}**\n"
-            f"- Chunks: **{len(_doc_chunks)}**\n"
-            f"- Discourse roles: `{json.dumps(roles)}`\n\n"
-            f"Now explore the other tabs!"
-        )
-    except Exception as e:
-        return f"❌ Error: {e}"
-def ask_question(question):
-    if not _doc_chunks:
-        return "⚠️ Upload a document first.", ""
-    if not question.strip():
-        return "⚠️ Enter a question.", ""
-    hits = keyword_search(question, _doc_chunks)
-    if not hits:
-        return "No relevant content found for that question.", ""
-    answer = f"Based on **{_doc_name}**, here is what was found:\n\n"
-    for c in hits:
-        answer += f"📄 *Page {c['page']}:* {c['text'][:300]}…\n\n"
-    evidence = "### 📎 Retrieved Chunks (keyword match)\n\n"
-    for i, c in enumerate(hits, 1):
-        role = classify_role(c["text"])
-        evidence += f"**[{i}] Page {c['page']} | role: `{role}`**\n> {c['text'][:250]}…\n\n"
-    return answer, evidence
-def show_esg_scores():
-    if not _doc_chunks:
-        return "⚠️ Upload a document first."
-    scores = compute_esg_scores(_doc_chunks)
-    sector = detect_sector(_doc_chunks)
-    def bar(v):
-        f = int(v / 5)
-        return "█" * f + "░" * (20 - f)
-    lines = [f"## 📊 ESG Scores — *{_doc_name}*\n",
-             "| Pillar | Score | Bar |", "|--------|-------|-----|"]
-    for pillar, score in scores.items():
-        lines.append(f"| {pillar} | {score}% | `{bar(score)}` |")
-    overall = round(sum(scores.values()) / 3, 1)
-    lines.append(f"| ⭐ **Overall** | **{overall}%** | `{bar(overall)}` |")
-    lines.append(f"\n**Detected Sector:** {sector}")
-    lines.append("\n> *Scores are keyword-density proxies for demonstration.*")
-    return "\n".join(lines)
-def show_greenwashing():
-    if not _doc_chunks:
-        return "⚠️ Upload a document first."
-    flags = detect_greenwashing(_doc_chunks)
-    if not flags:
-        return "✅ No greenwashing keywords detected."
-    unverified = [f for f in flags if not f["verified"]]
-    verified   = [f for f in flags if f["verified"]]
-    lines = [f"## 🚨 Greenwashing Report — *{_doc_name}*\n",
-             f"**Flagged:** {len(flags)} claims ({len(unverified)} ⚠️ unverified | {len(verified)} ✅ with evidence)\n\n---\n"]
-    if unverified:
-        lines.append("### ⚠️ Unverified Claims\n")
-        for f in unverified:
-            lines.append(f"📍 **Page {f['page']}** — {', '.join(f['keywords'])}\n> {f['snip']}…\n")
-    if verified:
-        lines.append("\n### ✅ Claims With Supporting Evidence\n")
-        for f in verified:
-            lines.append(f"📍 **Page {f['page']}** — {', '.join(f['keywords'])}\n> {f['snip']}…\n")
-    return "\n".join(lines)
-def show_graph():
-    if not _doc_chunks:
-        return "⚠️ Upload a document first."
     roles = {}
-    for c in _doc_chunks:
-        r = classify_role(c["text"])
-        roles[r] = roles.get(r, 0) + 1
-    return (
-        f"## 🕸️ Discourse Graph Summary — *{_doc_name}*\n\n"
-        "Chunks are classified into discourse roles and linked by typed edges.\n\n"
-        "| Role | Count | Meaning |\n"
-        "|------|-------|---------|\n"
-        f"| `claim`    | {roles.get('claim', 0)}    | Sustainability claims (greenwashing candidates) |\n"
-        f"| `evidence` | {roles.get('evidence', 0)} | Data, measurements, statistics |\n"
-        f"| `policy`   | {roles.get('policy', 0)}   | Commitments, targets, goals |\n"
-        f"| `metric`   | {roles.get('metric', 0)}   | KPIs and indicators |\n"
-        f"| `context`  | {roles.get('context', 0)}  | General narrative |\n\n"
-        "**Edge types:** `follows` · `supported_by` (claim→evidence) · `measured_by` (policy→metric)\n\n"
-        "> In a full HyperRAG deployment, these edges enable multi-hop reasoning across the document."
-    )
 # ── UI ────────────────────────────────────────────────────────────────────────
-with gr.Blocks(title="ESG Intelligence Prototype") as demo:
-    gr.Markdown("# 🌿 ESG Document Intelligence Prototype\n**HyperRAG + Discourse Graph** — upload an ESG PDF to explore")
     with gr.Tab("📤 Upload"):
-        pdf_in   = gr.File(label="ESG Report (PDF)", file_types=[".pdf"])
-        proc_btn = gr.Button("⚙️ Process Document", variant="primary")
-        proc_out = gr.Markdown("Upload a PDF and click Process.")
-        proc_btn.click(process_pdf, inputs=pdf_in, outputs=proc_out)
     with gr.Tab("💬 Q&A"):
-        q_in  = gr.Textbox(label="Question", placeholder="What are the carbon reduction targets?")
-        q_btn = gr.Button("Ask", variant="primary")
-        q_ans = gr.Markdown(label="Answer")
-        q_ev  = gr.Markdown(label="Evidence")
-        gr.Examples([
-            ["What are the Scope 1 and 2 emissions?"],
-            ["What diversity initiatives are mentioned?"],
-            ["What governance policies exist?"],
-            ["What renewable energy targets are set?"],
-        ], inputs=q_in)
-        q_btn.click(ask_question, inputs=q_in, outputs=[q_ans, q_ev])
     with gr.Tab("📊 ESG Scores"):
-        s_btn = gr.Button("Compute Scores", variant="primary")
-        s_out = gr.Markdown()
-        s_btn.click(show_esg_scores, outputs=s_out)
     with gr.Tab("🚨 Greenwashing"):
-        g_btn = gr.Button("Detect Claims", variant="primary")
-        g_out = gr.Markdown()
-        g_btn.click(show_greenwashing, outputs=g_out)
     with gr.Tab("🕸️ Discourse Graph"):
-        d_btn = gr.Button("Show Graph Summary", variant="primary")
-        d_out = gr.Markdown()
-        d_btn.click(show_graph, outputs=d_out)
 demo.launch()

 """
 ESG Document Intelligence Prototype
+Qdrant vector search + Discourse Graph
 """
 import gradio as gr
 import re
 import json
+from pathlib import Path
+# ── lazy imports ──────────────────────────────────────────────────────────────
+def get_qdrant():
+    from qdrant_client import QdrantClient
+    from qdrant_client.models import Distance, VectorParams, PointStruct
+    return QdrantClient, Distance, VectorParams, PointStruct
+def get_embedder():
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer("all-MiniLM-L6-v2")
+def get_pdfplumber():
+    import pdfplumber
+    return pdfplumber
+# ── Config ────────────────────────────────────────────────────────────────────
+COLLECTION = "esg"
+DIM = 384
+GREENWASHING_KW = [
+    "carbon neutral", "net-zero", "net zero", "zero emissions",
+    "100% renewable", "carbon offset", "zero waste", "eco-friendly",
+    "fully sustainable", "nature positive", "carbon negative"
+]
+ESG_KW = {
+    "🌿 Environmental": ["carbon","emission","climate","renewable","energy","water","waste","pollution","solar","biodiversity"],
+    "👥 Social":        ["employee","diversity","inclusion","health","safety","human rights","labor","gender","community"],
+    "🏛️ Governance":   ["board","audit","compliance","ethics","transparency","corruption","disclosure","regulation","policy"]
+}
+# ── State ─────────────────────────────────────────────────────────────────────
+state = {"client": None, "embedder": None, "chunks": [], "name": "", "ready": False}
+# ── Init ──────────────────────────────────────────────────────────────────────
+def init():
+    if state["embedder"] is None:
+        state["embedder"] = get_embedder()
+    if state["client"] is None:
+        QdrantClient, Distance, VectorParams, _ = get_qdrant()
+        c = QdrantClient(":memory:")
+        c.recreate_collection(COLLECTION, vectors_config=VectorParams(size=DIM, distance=Distance.COSINE))
+        state["client"] = c
+# ── PDF + chunking ────────────────────────────────────────────────────────────
+def load_pdf(path):
+    pdfplumber = get_pdfplumber()
     pages = []
+    with pdfplumber.open(path) as pdf:
+        for i, p in enumerate(pdf.pages):
+            t = (p.extract_text() or "").strip()
+            if t:
+                pages.append({"page": i+1, "text": t})
     return pages
+def chunk(pages, size=250):
+    out = []
     for pg in pages:
         words = pg["text"].split()
+        for s in range(0, len(words), size):
+            t = " ".join(words[s:s+size])
+            if len(t) > 30:
+                out.append({"page": pg["page"], "text": t})
+    return out
+# ── Discourse role ────────────────────────────────────────────────────────────
+def role(text):
     t = text.lower()
+    if any(k in t for k in GREENWASHING_KW):            return "claim"
+    if any(k in t for k in ["%","tonnes","kwh","mwh"]): return "evidence"
+    if any(k in t for k in ["target","goal","by 2030","by 2050","we will","commitment"]): return "policy"
+    if any(k in t for k in ["kpi","metric","indicator"]): return "metric"
     return "context"
+# ── Handlers ──────────────────────────────────────────────────────────────────
+def process(pdf):
+    if pdf is None: return "⚠️ Upload a PDF first."
+    try:
+        init()
+        _, _, _, PointStruct = get_qdrant()
+        pages  = load_pdf(pdf.name)
+        chunks = chunk(pages)
+        embeds = state["embedder"].encode([c["text"] for c in chunks], batch_size=32, normalize_embeddings=True)
+        state["client"].recreate_collection(COLLECTION,
+            vectors_config=__import__("qdrant_client").models.VectorParams(size=DIM,
+            distance=__import__("qdrant_client").models.Distance.COSINE))
+        pts = [PointStruct(id=i, vector=e.tolist(), payload={"page": c["page"], "text": c["text"]})
+               for i,(c,e) in enumerate(zip(chunks, embeds))]
+        state["client"].upsert(COLLECTION, pts)
+        state["chunks"] = chunks
+        state["name"]   = Path(pdf.name).name
+        state["ready"]  = True
+        roles = {}
+        for c in chunks:
+            r = role(c["text"]); roles[r] = roles.get(r,0)+1
+        return (f"✅ **{state['name']}** processed\n\n"
+                f"- Pages: **{len(pages)}** | Chunks: **{len(chunks)}**\n"
+                f"- Discourse nodes: `{json.dumps(roles)}`\n\n"
+                "Explore the tabs →")
+    except Exception as e:
+        return f"❌ {e}"
+def ask(q):
+    if not state["ready"]: return "⚠️ Upload a document first.", ""
+    if not q.strip():      return "⚠️ Enter a question.", ""
+    vec = state["embedder"].encode([q], normalize_embeddings=True)[0].tolist()
+    hits = state["client"].search(COLLECTION, vec, limit=4, with_payload=True)
+    ans  = f"**Results from {state['name']}:**\n\n"
+    evid = "### 📎 Retrieved Evidence\n\n"
+    for i, h in enumerate(hits, 1):
+        pg, txt = h.payload["page"], h.payload["text"]
+        r = role(txt)
+        ans  += f"📄 **Page {pg}:** {txt[:280]}…\n\n"
+        evid += f"**[{i}] Page {pg} | score {h.score:.3f} | role `{r}`**\n> {txt[:220]}…\n\n"
+    return ans, evid
+def esg_scores():
+    if not state["ready"]: return "⚠️ Upload a document first."
+    text = " ".join(c["text"] for c in state["chunks"]).lower()
+    counts = {k: sum(text.count(w) for w in ws) for k,ws in ESG_KW.items()}
+    total  = sum(counts.values()) or 1
+    scores = {k: round(v/total*100,1) for k,v in counts.items()}
+    overall = round(sum(scores.values())/3, 1)
+    def bar(v): return "█"*int(v/5) + "░"*(20-int(v/5))
+    rows = "\n".join(f"| {k} | {v}% | `{bar(v)}` |" for k,v in scores.items())
+    return (f"## 📊 ESG Scores — *{state['name']}*\n\n"
+            f"| Pillar | Score | Bar |\n|--------|-------|-----|\n{rows}\n"
+            f"| ⭐ Overall | **{overall}%** | `{bar(overall)}` |\n\n"
+            "> Keyword-density proxy scores.")
+def greenwashing():
+    if not state["ready"]: return "⚠️ Upload a document first."
     flags, seen = [], set()
+    for c in state["chunks"]:
+        t = c["text"].lower()
+        matched = [k for k in GREENWASHING_KW if k in t]
         if matched:
+            key = (c["page"], matched[0])
             if key not in seen:
                 seen.add(key)
+                verified = any(w in t for w in ["certified","verified","audited","third party","sbti"])
+                flags.append({"page":c["page"],"kws":matched,"snip":c["text"][:200],"ok":verified})
+    if not flags: return "✅ No greenwashing keywords found."
+    bad  = [f for f in flags if not f["ok"]]
+    good = [f for f in flags if f["ok"]]
+    out  = [f"## 🚨 Greenwashing — *{state['name']}*\n",
+            f"{len(bad)} unverified ⚠️  |  {len(good)} with evidence ✅\n\n---\n"]
+    if bad:
+        out.append("### ⚠️ Unverified\n")
+        for f in bad:
+            out.append(f"📍 **Page {f['page']}** — `{'`, `'.join(f['kws'])}`\n> {f['snip']}…\n")
+    if good:
+        out.append("\n### ✅ Evidenced\n")
+        for f in good:
+            out.append(f"📍 **Page {f['page']}** — `{'`, `'.join(f['kws'])}`\n> {f['snip']}…\n")
+    return "\n".join(out)
+def graph():
+    if not state["ready"]: return "⚠️ Upload a document first."
     roles = {}
+    for c in state["chunks"]:
+        r = role(c["text"]); roles[r] = roles.get(r,0)+1
+    rows = "\n".join(f"| `{r}` | {n} |" for r,n in sorted(roles.items(), key=lambda x:-x[1]))
+    return (f"## 🕸️ Discourse Graph — *{state['name']}*\n\n"
+            f"| Role | Chunks |\n|------|--------|\n{rows}\n\n"
+            "**Edges modelled:**\n"
+            "- `follows` — sequential chunks\n"
+            "- `supported_by` — claim → evidence\n"
+            "- `measured_by` — policy → metric\n\n"
+            "> Graph expansion enables multi-hop HyperRAG retrieval.")
 # ── UI ────────────────────────────────────────────────────────────────────────
+with gr.Blocks(title="ESG Intelligence") as demo:
+    gr.Markdown("# 🌿 ESG Document Intelligence\n*Qdrant semantic search · Discourse graph reasoning*")
     with gr.Tab("📤 Upload"):
+        f   = gr.File(label="ESG Report PDF", file_types=[".pdf"])
+        btn = gr.Button("Process", variant="primary")
+        out = gr.Markdown("Upload a PDF and click Process.")
+        btn.click(process, f, out)
     with gr.Tab("💬 Q&A"):
+        q   = gr.Textbox(label="Question", placeholder="What are the carbon reduction targets?")
+        btn2 = gr.Button("Ask", variant="primary")
+        ans  = gr.Markdown()
+        ev   = gr.Markdown()
+        gr.Examples([["What are Scope 1 and 2 emissions?"],["What diversity initiatives exist?"],
+                     ["What are the renewable energy targets?"],["What governance policies are in place?"]], q)
+        btn2.click(ask, q, [ans, ev])
     with gr.Tab("📊 ESG Scores"):
+        gr.Button("Compute", variant="primary").click(esg_scores, outputs=gr.Markdown())
     with gr.Tab("🚨 Greenwashing"):
+        gr.Button("Detect", variant="primary").click(greenwashing, outputs=gr.Markdown())
     with gr.Tab("🕸️ Discourse Graph"):
+        gr.Button("Show", variant="primary").click(graph, outputs=gr.Markdown())
 demo.launch()