Spaces:

CiLprototype
/

esg-intelligence

Sleeping

App Files Files Community

GirishaBuilds01 committed 6 days ago

Commit

bf43189

verified ·

1 Parent(s): c162115

Create app.py

Browse files

Files changed (1) hide show

app.py +794 -0

app.py ADDED Viewed

	@@ -0,0 +1,794 @@

+"""
+Multimodal ESG Document Intelligence Platform
+Using HyperRAG and Discourse Graph Reasoning
+"""
+import gradio as gr
+import os
+import json
+import re
+import time
+import hashlib
+from pathlib import Path
+import numpy as np
+# ── lazy imports ──────────────────────────────────────────────────────────────
+def _import_pdf():
+    """Import ``pdfplumber`` on first use and return the module object."""
+    import pdfplumber as pdf_backend
+    return pdf_backend
+def _import_torch():
+    """Import ``torch`` on first use and return the module object."""
+    import torch as torch_module
+    return torch_module
+def _import_transformers():
+    """Lazily import the ``transformers`` entry points used by this app.
+
+    Returns:
+        The tuple ``(pipeline, AutoTokenizer, AutoModelForSeq2SeqLM)``, in
+        that order; callers unpack it positionally.
+    """
+    import transformers as hf
+    return hf.pipeline, hf.AutoTokenizer, hf.AutoModelForSeq2SeqLM
+def _import_sentence_transformers():
+    """Lazily import and return the ``SentenceTransformer`` class."""
+    import sentence_transformers as st_package
+    return st_package.SentenceTransformer
+def _import_qdrant():
+    """Lazily import the Qdrant client class and the model types used here.
+
+    Returns:
+        A 7-tuple in this fixed order: ``(QdrantClient, Distance,
+        VectorParams, PointStruct, Filter, FieldCondition, MatchValue)``.
+        Callers unpack it positionally, so the order must not change.
+    """
+    import qdrant_client as qdrant_package
+    from qdrant_client import models as qdrant_models
+    return (
+        qdrant_package.QdrantClient,
+        qdrant_models.Distance,
+        qdrant_models.VectorParams,
+        qdrant_models.PointStruct,
+        qdrant_models.Filter,
+        qdrant_models.FieldCondition,
+        qdrant_models.MatchValue,
+    )
+def _import_networkx():
+    """Import ``networkx`` on first use and return the module object."""
+    import networkx
+    return networkx
+# ── Constants ─────────────────────────────────────────────────────────────────
+# Model id handed to SentenceTransformer() in load_models().
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+# Model id for the "text2text-generation" pipeline that writes the answers.
+QA_MODEL_NAME    = "google/flan-t5-base"
+# Name of the single Qdrant collection; it is re-created for every upload.
+COLLECTION_NAME  = "esg_documents"
+# Vector size the Qdrant collection is created with.
+# NOTE(review): presumably the output width of EMBED_MODEL_NAME — confirm
+# before swapping the embedding model.
+VECTOR_DIM       = 384
+CHUNK_SIZE       = 400       # chunk length in whitespace-separated words (rough token proxy)
+CHUNK_OVERLAP    = 80        # words shared by consecutive chunks of one page
+TOP_K_RETRIEVAL  = 5         # default number of vector-search hits used as seeds
+GRAPH_HOP_DEPTH  = 2         # default number of expansion rounds in graph_expand()
+# Lower-case phrases treated as sustainability claims. They are matched as
+# plain substrings of lower-cased chunk text by classify_chunk_role() and
+# detect_greenwashing().
+GREENWASHING_KEYWORDS = [
+    "carbon neutral", "net-zero", "net zero", "climate positive",
+    "100% renewable", "fully sustainable", "zero emissions",
+    "carbon negative", "eco-friendly", "green certified",
+    "environmentally responsible", "carbon offset", "carbon credits",
+    "biodegradable", "recyclable packaging", "zero waste",
+    "nature positive", "planet positive"
+]
+# Keyword lists per ESG pillar, consumed by compute_esg_scores(). The three
+# keys are also the keys of the score dict that function returns.
+ESG_CATEGORIES = {
+    "environmental": [
+        "carbon", "emission", "climate", "renewable", "energy", "water",
+        "waste", "biodiversity", "deforestation", "pollution", "recycling",
+        "greenhouse", "sustainability", "fossil fuel", "solar", "wind"
+    ],
+    "social": [
+        "employee", "diversity", "inclusion", "health", "safety", "community",
+        "human rights", "labor", "gender", "training", "wellbeing", "supply chain",
+        "stakeholder", "philanthropy", "education", "wage"
+    ],
+    "governance": [
+        "board", "director", "audit", "compliance", "ethics", "transparency",
+        "corruption", "bribery", "risk management", "disclosure", "accountability",
+        "shareholder", "executive compensation", "policy", "regulation"
+    ]
+}
+# ── Global State ──────────────────────────────────────────────────────────────
+# Process-wide mutable state shared by the pipeline and the Gradio handlers.
+# load_models() fills the first three slots; process_document() fills the rest.
+_state = {
+    "embed_model":    None,   # SentenceTransformer instance, set by load_models()
+    "qa_pipeline":    None,   # text2text-generation pipeline, set by load_models()
+    "qdrant_client":  None,   # in-memory QdrantClient, set by load_models()
+    "discourse_graph": None,  # graph returned by build_discourse_graph() for the current document
+    "chunks":         [],     # chunk dicts ({chunk_id, page, text}) of the current document
+    "doc_id":         None,   # first 8 hex chars of the MD5 of the file name
+    "doc_name":       "",     # base name of the uploaded file
+    "is_ready":       False,  # True once process_document() has completed
+}
+# ══════════════════════════════════════════════════════════════════════════════
+#  1.  MODEL LOADING
+# ══════════════════════════════════════════════════════════════════════════════
+def load_models():
+    """Initialise the shared models and the vector store, each at most once.
+
+    Fills the ``embed_model``, ``qa_pipeline`` and ``qdrant_client`` slots of
+    ``_state`` when they are still ``None``. Slots that are already populated
+    are left untouched, so repeated calls do not reload anything.
+
+    Returns:
+        A status string suitable for display in the UI.
+    """
+    if _state["embed_model"] is None:
+        SentenceTransformer = _import_sentence_transformers()
+        _state["embed_model"] = SentenceTransformer(EMBED_MODEL_NAME)
+    if _state["qa_pipeline"] is None:
+        pipeline, *_ = _import_transformers()
+        _state["qa_pipeline"] = pipeline(
+            "text2text-generation",
+            model=QA_MODEL_NAME,
+            max_new_tokens=256,
+        )
+    if _state["qdrant_client"] is None:
+        QdrantClient, Distance, VectorParams, *_ = _import_qdrant()
+        client = QdrantClient(":memory:")
+        # The client was constructed on the line above, so the collection
+        # cannot exist yet: a plain create is sufficient and avoids the
+        # deprecated recreate_collection() call.
+        client.create_collection(
+            collection_name=COLLECTION_NAME,
+            vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
+        )
+        # Publish the client only after its collection exists, so a failure
+        # above leaves the slot empty and the next call retries from scratch.
+        _state["qdrant_client"] = client
+    return "✅ Models loaded successfully"
+# ══════════════════════════════════════════════════════════════════════════════
+#  2.  DOCUMENT PROCESSING
+# ══════════════════════════════════════════════════════════════════════════════
+def extract_text_from_pdf(pdf_path: str) -> list[dict]:
+    """Read a PDF and return the text of every non-empty page.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+
+    Returns:
+        A list of ``{"page": <1-based page number>, "text": <stripped text>}``
+        dicts in page order. Pages whose extracted text is empty after
+        stripping are omitted, but still count towards the page numbering.
+    """
+    plumber = _import_pdf()
+    extracted = []
+    with plumber.open(pdf_path) as document:
+        for page_number, pdf_page in enumerate(document.pages, start=1):
+            # extract_text() may yield None; treat that the same as "".
+            page_text = (pdf_page.extract_text() or "").strip()
+            if not page_text:
+                continue
+            extracted.append({"page": page_number, "text": page_text})
+    return extracted
+def chunk_pages(pages: list[dict], chunk_size=None, overlap=None) -> list[dict]:
+    """Split page texts into overlapping word windows.
+
+    Chunks never span two pages. Within a page, consecutive windows start
+    ``chunk_size - overlap`` words apart. Windowing stops as soon as a window
+    reaches the end of the page, so no trailing chunk is emitted that would
+    only repeat the tail of the previous one.
+
+    Args:
+        pages: ``{"page": int, "text": str}`` dicts, e.g. from
+            ``extract_text_from_pdf``.
+        chunk_size: Window length in whitespace-separated words. Defaults to
+            ``CHUNK_SIZE`` when ``None``.
+        overlap: Words shared by consecutive windows. Defaults to
+            ``CHUNK_OVERLAP`` when ``None``.
+
+    Returns:
+        A list of ``{"chunk_id", "page", "text"}`` dicts. ``chunk_id`` counts
+        up from 0 across all pages, in emission order.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
+            ``[0, chunk_size)`` (the step would be zero or negative and the
+            windowing would never advance).
+    """
+    size = CHUNK_SIZE if chunk_size is None else chunk_size
+    shared = CHUNK_OVERLAP if overlap is None else overlap
+    if size <= 0:
+        raise ValueError("chunk_size must be positive")
+    if not 0 <= shared < size:
+        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
+    step = size - shared
+    chunks = []
+    for pg in pages:
+        words = pg["text"].split()
+        for start in range(0, len(words), step):
+            chunk_text = " ".join(words[start:start + size])
+            if len(chunk_text) > 50:          # skip tiny fragments
+                chunks.append({
+                    # ids are handed out only to chunks that are kept
+                    "chunk_id": len(chunks),
+                    "page":     pg["page"],
+                    "text":     chunk_text,
+                })
+            if start + size >= len(words):
+                # This window already covers the end of the page; any further
+                # window would be a pure duplicate of its tail.
+                break
+    return chunks
+def embed_chunks(chunks: list[dict]) -> np.ndarray:
+    """Encode the ``text`` of every chunk with the shared embedding model.
+
+    Requires ``load_models()`` to have populated ``_state["embed_model"]``.
+    The encoder is asked for normalised embeddings, in batches of 32, with
+    the progress bar disabled. Returns whatever ``encode`` returns, one row
+    per chunk in input order.
+    """
+    encoder = _state["embed_model"]
+    return encoder.encode(
+        [entry["text"] for entry in chunks],
+        batch_size=32,
+        show_progress_bar=False,
+        normalize_embeddings=True,
+    )
+def index_chunks(chunks: list[dict], embeddings: np.ndarray, doc_id: str):
+    """Write one Qdrant point per chunk into ``COLLECTION_NAME``.
+
+    Args:
+        chunks: Chunk dicts carrying ``chunk_id``, ``page`` and ``text``.
+        embeddings: One vector per chunk, aligned with ``chunks`` by position.
+        doc_id: Identifier stored in each point's payload.
+
+    The point id is the chunk's position in ``chunks``; the payload repeats
+    the chunk fields plus ``doc_id``.
+    """
+    _, _, _, PointStruct, *_ = _import_qdrant()
+    store = _state["qdrant_client"]
+    points = [
+        PointStruct(
+            id=position,
+            vector=vector.tolist(),
+            payload={
+                "chunk_id": chunk["chunk_id"],
+                "page":     chunk["page"],
+                "text":     chunk["text"],
+                "doc_id":   doc_id,
+            },
+        )
+        for position, (chunk, vector) in enumerate(zip(chunks, embeddings))
+    ]
+    store.upsert(collection_name=COLLECTION_NAME, points=points)
+# ══════════════════════════════════════════════════════════════════════════════
+#  3.  DISCOURSE GRAPH
+# ══════════════════════════════════════════════════════════════════════════════
+def classify_chunk_role(text: str) -> str:
+    """Assign a discourse role to a chunk from the markers it contains.
+
+    The lower-cased text is tested against each marker list in priority
+    order — ``claim``, ``evidence``, ``policy``, ``metric`` — and the first
+    role with any substring match wins. Text matching nothing is ``context``.
+    """
+    lowered = text.lower()
+    # Highest priority first: a chunk that both makes a claim and quotes
+    # numbers is still classified as a claim.
+    role_markers = (
+        ("claim", GREENWASHING_KEYWORDS),
+        ("evidence", ["data shows", "according to", "measured", "percent", "%", "tonnes", "mwh", "kwh"]),
+        ("policy", ["policy", "commitment", "we will", "target", "goal", "by 2030", "by 2050"]),
+        ("metric", ["kpi", "metric", "indicator", "score", "rating", "index"]),
+    )
+    for role, markers in role_markers:
+        for marker in markers:
+            if marker in lowered:
+                return role
+    return "context"
+def build_discourse_graph(chunks: list[dict]) -> object:
+    """Build the directed discourse graph for a list of chunks.
+
+    Nodes are chunk ids carrying ``text``, ``page`` and ``role`` attributes.
+    Edges are added in three passes:
+
+    * ``follows``      — each chunk to the next one in list order;
+    * ``supported_by`` — every claim to every evidence chunk at most 2 pages away;
+    * ``measured_by``  — every policy to every metric chunk at most 3 pages away.
+
+    A later pass overwrites the ``relation`` of an edge that already exists.
+    """
+    nx = _import_networkx()
+    graph = nx.DiGraph()
+    for chunk in chunks:
+        graph.add_node(
+            chunk["chunk_id"],
+            text=chunk["text"],
+            page=chunk["page"],
+            role=classify_chunk_role(chunk["text"]),
+        )
+    # Narrative continuity between neighbouring chunks.
+    for earlier, later in zip(chunks, chunks[1:]):
+        graph.add_edge(earlier["chunk_id"], later["chunk_id"], relation="follows")
+    # Bucket the chunks by the role stored on their node, keeping list order.
+    by_role = {}
+    for chunk in chunks:
+        by_role.setdefault(graph.nodes[chunk["chunk_id"]]["role"], []).append(chunk)
+
+    def link(sources, targets, max_page_gap, relation):
+        """Connect every source to every target within the page window."""
+        for src in sources:
+            for dst in targets:
+                if abs(src["page"] - dst["page"]) <= max_page_gap:
+                    graph.add_edge(src["chunk_id"], dst["chunk_id"], relation=relation)
+
+    link(by_role.get("claim", []), by_role.get("evidence", []), 2, "supported_by")
+    link(by_role.get("policy", []), by_role.get("metric", []), 3, "measured_by")
+    return graph
+# ══════════════════════════════════════════════════════════════════════════════
+#  4.  HyperRAG RETRIEVAL
+# ══════════════════════════════════════════════════════════════════════════════
+def vector_search(query: str, top_k: int = TOP_K_RETRIEVAL) -> list[dict]:
+    """Return the chunks whose embeddings are closest to the query.
+
+    Args:
+        query: Natural-language query; it is embedded with the shared
+            embedding model using normalised embeddings.
+        top_k: Maximum number of hits to return.
+
+    Returns:
+        ``{"chunk_id", "page", "text", "score"}`` dicts in the order the
+        vector store returned them, with ``score`` rounded to 4 decimals.
+    """
+    qvec = _state["embed_model"].encode([query], normalize_embeddings=True)[0].tolist()
+    client = _state["qdrant_client"]
+    if hasattr(client, "query_points"):
+        # query_points() supersedes the deprecated search(); its response
+        # wraps the scored points in a ``points`` attribute.
+        results = client.query_points(
+            collection_name=COLLECTION_NAME,
+            query=qvec,
+            limit=top_k,
+            with_payload=True,
+        ).points
+    else:
+        # Older qdrant-client releases only offer search().
+        results = client.search(
+            collection_name=COLLECTION_NAME,
+            query_vector=qvec,
+            limit=top_k,
+            with_payload=True,
+        )
+    return [
+        {
+            "chunk_id": r.payload["chunk_id"],
+            "page":     r.payload["page"],
+            "text":     r.payload["text"],
+            "score":    round(r.score, 4),
+        }
+        for r in results
+    ]
+def graph_expand(seed_chunk_ids: list[int], depth: int = GRAPH_HOP_DEPTH) -> list[int]:
+    """Expand seed chunks to their discourse-graph neighbourhood.
+
+    Starting from the seeds, follows edges in both directions for ``depth``
+    rounds and collects every chunk id reached.
+
+    Args:
+        seed_chunk_ids: Chunk ids to start from.
+        depth: Number of expansion rounds.
+
+    Returns:
+        The de-duplicated seeds in their given order, followed by the newly
+        reached ids in discovery order (nearer hops first). The order is
+        deterministic, so a caller that truncates the list keeps the closest
+        neighbours. When no graph has been built yet, ``seed_chunk_ids`` is
+        returned unchanged.
+    """
+    graph = _state["discourse_graph"]
+    if graph is None:
+        return seed_chunk_ids
+    ordered = list(dict.fromkeys(seed_chunk_ids))   # de-duplicate, keep order
+    visited = set(ordered)
+    frontier = list(ordered)
+    for _ in range(depth):
+        discovered = []
+        for node in frontier:
+            if not graph.has_node(node):
+                continue
+            # Edges are directed, but context is useful in both directions.
+            for neighbour in (*graph.successors(node), *graph.predecessors(node)):
+                if neighbour not in visited:
+                    visited.add(neighbour)
+                    discovered.append(neighbour)
+        if not discovered:
+            break   # nothing new to expand from
+        ordered.extend(discovered)
+        frontier = discovered
+    return ordered
+def hyper_rag_retrieve(query: str) -> list[dict]:
+    """Retrieve context for a query by combining vector and graph lookups.
+
+    1. A vector search yields the seed chunks and their similarity scores.
+    2. The seeds are expanded through the discourse graph.
+    3. Every resulting id known to ``_state["chunks"]`` is copied and given
+       a ``score`` (0.0 for graph-only chunks) and a ``from_graph`` flag.
+    4. Vector hits are ordered before graph expansions, higher scores first,
+       and at most ``TOP_K_RETRIEVAL + 4`` chunks are returned.
+    """
+    hits = vector_search(query, top_k=TOP_K_RETRIEVAL)
+    hit_scores = {hit["chunk_id"]: hit["score"] for hit in hits}
+    neighbourhood = graph_expand(
+        [hit["chunk_id"] for hit in hits], depth=GRAPH_HOP_DEPTH
+    )
+    known_chunks = {chunk["chunk_id"]: chunk for chunk in _state["chunks"]}
+    candidates = [
+        {
+            **known_chunks[cid],
+            "score": hit_scores.get(cid, 0.0),
+            "from_graph": cid not in hit_scores,
+        }
+        for cid in neighbourhood
+        if cid in known_chunks
+    ]
+    # False sorts before True, so direct vector hits lead the list.
+    candidates.sort(key=lambda item: (item["from_graph"], -item["score"]))
+    limit = TOP_K_RETRIEVAL + 4   # slightly more context for QA
+    return candidates[:limit]
+# ══════════════════════════════════════════════════════════════════════════════
+#  5.  ANALYSIS MODULES
+# ══════════════════════════════════════════════════════════════════════════════
+def compute_esg_scores(chunks: list[dict], categories=None) -> dict:
+    """Score each ESG pillar from keyword hits in the chunk texts.
+
+    Keywords are counted in the lower-cased chunk text, anchored at a word
+    start. This lets multi-word keywords ("supply chain") and inflected or
+    punctuation-attached forms ("emissions,") count, which per-token equality
+    on whitespace-split words cannot do.
+
+    Args:
+        chunks: Chunk dicts with a ``text`` entry.
+        categories: Optional ``{pillar: [keyword, ...]}`` mapping. Defaults to
+            ``ESG_CATEGORIES`` when ``None``.
+
+    Returns:
+        ``{pillar: score, ..., "overall": score}``. Scores are relative: the
+        pillar with the most hits gets 100 and the others are scaled against
+        it; ``overall`` is the mean of the pillar scores. All values are 0
+        when the chunks contain no words.
+    """
+    pillars = ESG_CATEGORIES if categories is None else categories
+    # Compile once per call rather than once per chunk.
+    patterns = {
+        pillar: [re.compile(r"\b" + re.escape(kw.lower())) for kw in keywords]
+        for pillar, keywords in pillars.items()
+    }
+    hits = dict.fromkeys(pillars, 0)
+    total_words = 0
+    for chunk in chunks:
+        text = chunk["text"].lower()
+        total_words += len(text.split())
+        for pillar, compiled in patterns.items():
+            hits[pillar] += sum(len(p.findall(text)) for p in compiled)
+    if total_words == 0 or not hits:
+        return {**dict.fromkeys(pillars, 0), "overall": 0}
+    # Normalise to 0-100 relative to the strongest pillar; ``or 1`` avoids a
+    # division by zero when no keyword matched at all.
+    max_hits = max(hits.values()) or 1
+    norm = {k: round(min(v / max_hits * 100, 100), 1) for k, v in hits.items()}
+    norm["overall"] = round(sum(norm.values()) / len(norm), 1)
+    return norm
+def detect_sector(chunks: list[dict]) -> tuple[str, str]:
+    """Guess the industry sector of a document and name its typical risks.
+
+    Each sector's keywords are counted in the combined lower-cased text,
+    anchored at a word start so that "oil" no longer matches inside "soil"
+    while plural forms ("banks") still count. The sector with the most hits
+    wins; ties go to the sector listed first.
+
+    Args:
+        chunks: Chunk dicts with a ``text`` entry.
+
+    Returns:
+        ``(sector, risk_summary)``. When no keyword matches at all, a generic
+        "General / Diversified" pair is returned.
+    """
+    sector_keywords = {
+        "Energy & Utilities":    ["oil", "gas", "electricity", "utility", "power plant", "pipeline"],
+        "Finance & Banking":     ["bank", "investment", "portfolio", "loan", "insurance", "asset"],
+        "Technology":            ["software", "data center", "cloud", "semiconductor", "hardware"],
+        "Manufacturing":         ["factory", "manufacturing", "production", "supply chain", "logistics"],
+        "Consumer Goods":        ["product", "retail", "consumer", "packaging", "brand"],
+        "Real Estate":           ["property", "building", "construction", "real estate", "infrastructure"],
+        "Healthcare":            ["health", "pharmaceutical", "medical", "hospital", "clinical"],
+        "Agriculture & Food":    ["agriculture", "food", "farming", "crop", "livestock"],
+        "Transportation":        ["transport", "aviation", "shipping", "fleet", "logistics"],
+    }
+    sector_risk = {
+        "Energy & Utilities":    "High carbon exposure, stranded asset risk, regulatory transition",
+        "Finance & Banking":     "ESG credit risk, greenwashing liability, regulatory compliance",
+        "Technology":            "E-waste, data privacy, supply chain ethics, energy usage",
+        "Manufacturing":         "Scope 3 emissions, labour rights, waste management",
+        "Consumer Goods":        "Packaging waste, supply chain transparency, greenwashing",
+        "Real Estate":           "Building energy efficiency, urban heat islands, climate resilience",
+        "Healthcare":            "Pharmaceutical waste, access to medicines, clinical trial ethics",
+        "Agriculture & Food":    "Land use, water scarcity, biodiversity loss, food waste",
+        "Transportation":        "Fleet emissions, fuel transition, last-mile logistics",
+    }
+    text = " ".join(c["text"] for c in chunks).lower()
+    # ``\b`` only on the left: blocks mid-word hits, keeps suffixes/plurals.
+    sector_hits = {
+        sector: sum(len(re.findall(r"\b" + re.escape(kw), text)) for kw in kws)
+        for sector, kws in sector_keywords.items()
+    }
+    sector = max(sector_hits, key=sector_hits.get)
+    if sector_hits[sector] == 0:
+        return (
+            "General / Diversified",
+            "Cross-sector ESG exposure, disclosure quality, stakeholder engagement",
+        )
+    return sector, sector_risk[sector]
+def detect_greenwashing(chunks: list[dict]) -> list[dict]:
+    """Flag chunks that contain sustainability-claim keywords.
+
+    A chunk is flagged when its lower-cased text contains any entry of
+    ``GREENWASHING_KEYWORDS``. The flag is marked ``verified`` when the same
+    chunk also contains an evidence marker (verified, certified, third party,
+    audited, ISO, SBTi, science-based, independently). Evidence markers are
+    matched as whole words and outside the claim phrases themselves, so
+    "comparison" does not count as "ISO", "unverified" does not count as
+    "verified", and the claim "green certified" does not verify itself.
+
+    Only the flagged chunk is inspected for evidence; neighbouring chunks are
+    not consulted.
+
+    Args:
+        chunks: Chunk dicts with ``page`` and ``text`` entries.
+
+    Returns:
+        One ``{"page", "keywords", "text_snip", "verified"}`` dict per unique
+        (page, first matched keyword) pair, in chunk order. ``text_snip`` is
+        the first 220 characters, with an ellipsis when truncated.
+    """
+    evidence_pattern = re.compile(
+        r"\b(?:verified|certified|third[ -]party|audited|iso(?:[ -]?\d+)?"
+        r"|sbti|science-based|independently)\b"
+    )
+    seen = set()
+    unique_flags = []
+    for chunk in chunks:
+        text = chunk["text"]
+        text_lower = text.lower()
+        matched_kws = [kw for kw in GREENWASHING_KEYWORDS if kw in text_lower]
+        if not matched_kws:
+            continue
+        # Deduplicate by page + first keyword, keeping the first occurrence.
+        key = (chunk["page"], matched_kws[0])
+        if key in seen:
+            continue
+        seen.add(key)
+        # Blank out the claim phrases before looking for evidence, otherwise
+        # a claim containing e.g. "certified" would count as its own proof.
+        residual = text_lower
+        for kw in matched_kws:
+            residual = residual.replace(kw, " ")
+        unique_flags.append({
+            "page":      chunk["page"],
+            "keywords":  matched_kws,
+            "text_snip": text[:220] + ("…" if len(text) > 220 else ""),
+            "verified":  evidence_pattern.search(residual) is not None,
+        })
+    return unique_flags
+def answer_question(question: str, context_chunks: list[dict]) -> str:
+    """Ask the QA pipeline to answer a question from the retrieved chunks.
+
+    Each chunk is rendered as ``[Page N] text``; the passages are joined by
+    blank lines and only the first 3000 characters of that context go into
+    the prompt. Generation uses at most 256 new tokens without sampling.
+
+    Returns:
+        The stripped generated text, or ``"(Model error: ...)"`` when the
+        pipeline call raises.
+    """
+    passages = [f"[Page {chunk['page']}] {chunk['text']}" for chunk in context_chunks]
+    context = "\n\n".join(passages)
+    prompt = "".join([
+        "You are an expert ESG analyst. Answer the question based ONLY on the provided context.\n\n",
+        f"Context:\n{context[:3000]}\n\n",
+        f"Question: {question}\n\nAnswer:",
+    ])
+    try:
+        outputs = _state["qa_pipeline"](prompt, max_new_tokens=256, do_sample=False)
+        return outputs[0]["generated_text"].strip()
+    except Exception as exc:
+        # UI boundary: report the failure in the answer panel instead of raising.
+        return f"(Model error: {exc})"
+# ══════════════════════════════════════════════════════════════════════════════
+#  6.  TOP-LEVEL PIPELINE
+# ══════════════════════════════════════════════════════════════════════════════
+def process_document(pdf_file) -> str:
+    """Ingest an uploaded PDF and make it the active document.
+
+    Steps: load models, extract page text, chunk, embed, build the discourse
+    graph, rebuild the Qdrant collection, index the chunks, then publish the
+    new document in ``_state``.
+
+    Everything that can be computed without touching shared state is done
+    first. ``_state["is_ready"]`` is cleared before the vector collection is
+    replaced and set again only after all state has been stored, so a failure
+    halfway through cannot leave the handlers querying a new index against
+    the previous document's chunks and graph.
+
+    Args:
+        pdf_file: Upload object exposing a ``name`` path attribute, or
+            anything whose ``str()`` is the path of the PDF.
+
+    Returns:
+        A Markdown status message: an error line starting with ❌, or a
+        summary of the processed document.
+    """
+    if pdf_file is None:
+        return "❌ Please upload a PDF file."
+    load_models()
+    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
+    doc_name = Path(pdf_path).name
+    # Short identifier derived from the file name only (not the content).
+    doc_id   = hashlib.md5(doc_name.encode()).hexdigest()[:8]
+    # 1. Parse
+    pages = extract_text_from_pdf(pdf_path)
+    if not pages:
+        return "❌ Could not extract text from PDF. Try a text-based (not scanned) PDF."
+    # 2. Chunk
+    chunks = chunk_pages(pages)
+    if not chunks:
+        return "❌ Document appears empty after chunking."
+    # 3. Embed and build the graph while the previous document is still intact
+    embeddings = embed_chunks(chunks)
+    graph = build_discourse_graph(chunks)
+    # 4. Swap in the new document; not ready until every piece is in place
+    _state["is_ready"] = False
+    _, Distance, VectorParams, *_ = _import_qdrant()
+    client = _state["qdrant_client"]
+    # Fresh collection per upload (delete + create replaces the deprecated
+    # recreate_collection()).
+    client.delete_collection(collection_name=COLLECTION_NAME)
+    client.create_collection(
+        collection_name=COLLECTION_NAME,
+        vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
+    )
+    index_chunks(chunks, embeddings, doc_id)
+    _state["discourse_graph"] = graph
+    _state["chunks"]   = chunks
+    _state["doc_id"]   = doc_id
+    _state["doc_name"] = doc_name
+    _state["is_ready"] = True
+    # 5. Summarise
+    role_counts = {}
+    for _, attrs in graph.nodes(data=True):
+        role_counts[attrs["role"]] = role_counts.get(attrs["role"], 0) + 1
+    return (
+        f"✅ **Document processed successfully!**\n\n"
+        f"📄 **File:** {doc_name}\n"
+        f"📑 **Pages parsed:** {len(pages)}\n"
+        f"🔷 **Chunks indexed:** {len(chunks)}\n"
+        f"🕸️ **Discourse graph nodes:** {graph.number_of_nodes()} | "
+        f"edges: {graph.number_of_edges()}\n"
+        f"🏷️ **Node roles:** {json.dumps(role_counts)}\n\n"
+        f"You can now ask questions, view ESG scores, or run greenwashing detection."
+    )
+# ══════════════════════════════════════════════════════════════════════════════
+#  7.  GRADIO HANDLER FUNCTIONS
+# ══════════════════════════════════════════════════════════════════════════════
+def handle_question(question: str) -> tuple[str, str]:
+    """Answer a user question about the active document.
+
+    Args:
+        question: Text from the question box; ``None``, empty and
+            whitespace-only values are rejected with a prompt to enter a
+            question.
+
+    Returns:
+        ``(answer, evidence_markdown)``. The evidence panel lists every
+        retrieved chunk with its page, score and origin (vector hit or graph
+        expansion) and the first 300 characters of its text; an ellipsis is
+        shown only when the text was actually cut. When the request cannot be
+        served, the first element is a warning and the second is empty.
+    """
+    if not _state["is_ready"]:
+        return "⚠️ Please upload and process a document first.", ""
+    if not question or not question.strip():
+        return "⚠️ Please enter a question.", ""
+    retrieved = hyper_rag_retrieve(question)
+    answer    = answer_question(question, retrieved)
+    evidence_lines = ["### 📎 Retrieved Evidence (HyperRAG)\n"]
+    for i, chunk in enumerate(retrieved, 1):
+        tag = "🟣 graph" if chunk.get("from_graph") else "🔵 vector"
+        text = chunk["text"]
+        snippet = text[:300] + ("…" if len(text) > 300 else "")
+        evidence_lines.append(
+            f"**[{i}] Page {chunk['page']} | score={chunk['score']:.3f} | {tag}**\n"
+            f"> {snippet}\n"
+        )
+    return answer, "\n".join(evidence_lines)
+def handle_esg_scores() -> str:
+    """Render the ESG score table and sector summary as Markdown.
+
+    Reads the active document from ``_state``; returns a warning when no
+    document has been processed yet.
+    """
+    if not _state["is_ready"]:
+        return "⚠️ Please upload and process a document first."
+    chunks = _state["chunks"]
+    scores = compute_esg_scores(chunks)
+    sector, risk = detect_sector(chunks)
+
+    def bar(value):
+        """Draw a 20-cell bar with one filled cell per 5 score points."""
+        filled = int(value / 5)
+        return "█" * filled + "░" * (20 - filled)
+
+    env = scores["environmental"]
+    soc = scores["social"]
+    gov = scores["governance"]
+    overall = scores["overall"]
+    sections = [
+        f"## 📊 ESG Score Analysis — *{_state['doc_name']}*\n\n",
+        "| Pillar | Score | Bar |\n",
+        "|--------|-------|-----|\n",
+        f"| 🌿 Environmental | {env:.1f}/100 | `{bar(env)}` |\n",
+        f"| 👥 Social        | {soc:.1f}/100 | `{bar(soc)}` |\n",
+        f"| 🏛️ Governance    | {gov:.1f}/100 | `{bar(gov)}` |\n",
+        f"| ⭐ **Overall**   | **{overall:.1f}/100** | `{bar(overall)}` |\n\n",
+        "---\n",
+        "### 🏭 Sector Detection\n",
+        f"**Identified Sector:** {sector}\n\n",
+        f"**Key Risk Factors:** {risk}\n\n",
+        "> *Scores are keyword-density proxies. For investment decisions, use certified ESG ratings.*",
+    ]
+    return "".join(sections)
+def handle_greenwashing() -> str:
+    """Render the greenwashing report for the active document as Markdown.
+
+    Flags are split into unverified and verified groups, each listed under
+    its own heading with page, keywords and a text snippet. Returns a warning
+    when no document is loaded and an all-clear message when nothing is
+    flagged.
+    """
+    if not _state["is_ready"]:
+        return "⚠️ Please upload and process a document first."
+    flags = detect_greenwashing(_state["chunks"])
+    if not flags:
+        return (
+            "✅ **No greenwashing keywords detected** in this document.\n\n"
+            "The report does not contain common unsubstantiated sustainability claims."
+        )
+    unverified = []
+    verified = []
+    for flag in flags:
+        (verified if flag["verified"] else unverified).append(flag)
+
+    def render(entries):
+        """Format one Markdown entry per flag."""
+        rendered = []
+        for entry in entries:
+            kws = ", ".join(f"**{k}**" for k in entry["keywords"])
+            rendered.append(
+                f"📍 **Page {entry['page']}** — Keywords: {kws}\n"
+                f"> {entry['text_snip']}\n"
+            )
+        return rendered
+
+    lines = [
+        f"## 🚨 Greenwashing Detection Report — *{_state['doc_name']}*\n",
+        f"**Total flagged claims:** {len(flags)} "
+        f"({len(unverified)} unverified ⚠️ | {len(verified)} with evidence ✅)\n\n---\n",
+    ]
+    if unverified:
+        lines.append("### ⚠️ Unverified Claims (Higher Risk)\n")
+        lines.extend(render(unverified))
+    if verified:
+        lines.append("\n### ✅ Claims With Supporting Evidence\n")
+        lines.extend(render(verified))
+    lines.append(
+        "\n---\n*Greenwashing detection is keyword-based. "
+        "Human expert review is recommended for investment decisions.*"
+    )
+    return "\n".join(lines)
+def handle_graph_insights() -> str:
+    """Render discourse-graph statistics for the active document as Markdown.
+
+    Reports node and edge totals, the number of nodes per role, the number of
+    edges per relation (both most frequent first) and the five nodes with the
+    highest degree. Returns a warning when no document is loaded.
+    """
+    if not _state["is_ready"]:
+        return "⚠️ Please upload and process a document first."
+    nx = _import_networkx()
+    graph = _state["discourse_graph"]
+
+    def bullets(counts):
+        """List ``label: count`` bullets, largest count first."""
+        ranked = sorted(counts.items(), key=lambda pair: -pair[1])
+        return "\n".join(f"- `{label}`: {count}" for label, count in ranked)
+
+    role_counts = {}
+    for _, attrs in graph.nodes(data=True):
+        role = attrs["role"]
+        role_counts[role] = role_counts.get(role, 0) + 1
+    edge_counts = {}
+    for _, _, attrs in graph.edges(data=True):
+        relation = attrs.get("relation", "unknown")
+        edge_counts[relation] = edge_counts.get(relation, 0) + 1
+    # Top connected nodes
+    hubs = sorted(graph.degree(), key=lambda pair: pair[1], reverse=True)[:5]
+    top_nodes = [
+        f"- Chunk {nid} | Page {graph.nodes[nid]['page']} | "
+        f"Role: `{graph.nodes[nid]['role']}` | Degree: {deg}"
+        for nid, deg in hubs
+    ]
+    parts = [
+        f"## 🕸️ Discourse Graph Insights — *{_state['doc_name']}*\n\n",
+        "### Graph Statistics\n",
+        f"- **Nodes (chunks):** {graph.number_of_nodes()}\n",
+        f"- **Edges (relations):** {graph.number_of_edges()}\n\n",
+        "### Node Roles\n",
+        bullets(role_counts),
+        "\n\n### Relation Types\n",
+        bullets(edge_counts),
+        "\n\n### 🔗 Most Connected Chunks (Hub Nodes)\n",
+        "\n".join(top_nodes),
+        "\n\n> *Hub nodes represent cross-referenced ESG statements — ",
+        "key evidence or policy anchors in the report.*",
+    ]
+    return "".join(parts)
+# ══════════════════════════════════════════════════════════════════════════════
+#  8.  GRADIO UI
+# ══════════════════════════════════════════════════════════════════════════════
+# Stylesheet passed to gr.Blocks(css=...) in build_ui(). It styles the buttons
+# and the two header blocks tagged elem_id="title" and elem_id="subtitle".
+CSS = """
+.gr-button-primary { background: #1a6b3c !important; }
+.gr-button { border-radius: 8px !important; }
+#title { text-align: center; margin-bottom: 0.5em; }
+#subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }
+"""
+def build_ui():
+    """Assemble the Gradio Blocks interface and return it without launching.
+
+    The app has six tabs: upload/processing, question answering, ESG scores,
+    greenwashing detection, discourse-graph insights and a static About page.
+    Each action button is wired to the matching handler function of this
+    module; the analysis handlers take no inputs because they read the
+    processed document from the module-level ``_state``.
+
+    Returns:
+        The ``gr.Blocks`` instance.
+    """
+    with gr.Blocks(css=CSS, title="ESG Intelligence Platform") as demo:
+        gr.Markdown(
+            "# 🌿 Multimodal ESG Document Intelligence Platform\n"
+            "### HyperRAG + Discourse Graph Reasoning",
+            elem_id="title"
+        )
+        gr.Markdown(
+            "Upload an ESG report (PDF) to unlock **semantic Q&A**, "
+            "**ESG scoring**, **greenwashing detection**, and **graph-based reasoning**.",
+            elem_id="subtitle"
+        )
+        # ── Upload Tab ─────────────────────────────────────────────────────
+        with gr.Tab("📤 Upload & Process"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    pdf_input  = gr.File(label="Upload ESG Report (PDF)", file_types=[".pdf"])
+                    upload_btn = gr.Button("⚙️ Process Document", variant="primary")
+                with gr.Column(scale=3):
+                    upload_out = gr.Markdown(
+                        "Upload a PDF and click **Process Document** to begin."
+                    )
+            # The status message returned by process_document replaces the hint text.
+            upload_btn.click(process_document, inputs=pdf_input, outputs=upload_out)
+        # ── Q&A Tab ────────────────────────────────────────────────────────
+        with gr.Tab("💬 Ask Questions (HyperRAG)"):
+            gr.Markdown(
+                "Ask any question about the ESG report. "
+                "The HyperRAG pipeline combines **vector search** with "
+                "**discourse graph expansion** for multi-hop reasoning."
+            )
+            question_input  = gr.Textbox(
+                placeholder="e.g. What are the company's carbon reduction targets?",
+                label="Your Question",
+                lines=2,
+            )
+            ask_btn   = gr.Button("🔍 Ask", variant="primary")
+            answer_md = gr.Markdown(label="Answer")
+            evid_md   = gr.Markdown(label="Supporting Evidence")
+            # Example questions
+            gr.Examples(
+                examples=[
+                    ["What are the company's Scope 1, 2, and 3 emissions?"],
+                    ["What diversity and inclusion initiatives are mentioned?"],
+                    ["What governance policies are in place for risk management?"],
+                    ["What renewable energy targets has the company set?"],
+                    ["How does the company address human rights in its supply chain?"],
+                ],
+                inputs=question_input,
+            )
+            # handle_question returns (answer, evidence), matching the two outputs.
+            ask_btn.click(handle_question, inputs=question_input, outputs=[answer_md, evid_md])
+        # ── ESG Scores Tab ─────────────────────────────────────────────────
+        with gr.Tab("📊 ESG Scores & Sector"):
+            score_btn = gr.Button("📈 Compute ESG Scores", variant="primary")
+            score_out = gr.Markdown()
+            score_btn.click(handle_esg_scores, outputs=score_out)
+        # ── Greenwashing Tab ───────────────────────────────────────────────
+        with gr.Tab("🚨 Greenwashing Detection"):
+            gr.Markdown(
+                "Detects unsubstantiated sustainability claims and links them "
+                "to **exact page numbers** in the report."
+            )
+            gw_btn = gr.Button("🔎 Detect Greenwashing Claims", variant="primary")
+            gw_out = gr.Markdown()
+            gw_btn.click(handle_greenwashing, outputs=gw_out)
+        # ── Graph Tab ──────────────────────────────────────────────────────
+        with gr.Tab("🕸️ Discourse Graph"):
+            gr.Markdown(
+                "View the **discourse graph** that models logical relationships "
+                "(claims, evidence, policies, metrics) between ESG statements."
+            )
+            graph_btn = gr.Button("🔬 Analyse Discourse Graph", variant="primary")
+            graph_out = gr.Markdown()
+            graph_btn.click(handle_graph_insights, outputs=graph_out)
+        # ── About Tab ──────────────────────────────────────────────────────
+        with gr.Tab("ℹ️ About"):
+            gr.Markdown("""
+## About This Platform
+This prototype implements the **Multimodal ESG Document Intelligence Platform**
+combining three advanced AI techniques:
+### 🔷 Architecture
+| Component | Technology |
+|-----------|------------|
+| Vector Store | Qdrant (in-memory) |
+| Embeddings | `sentence-transformers/all-MiniLM-L6-v2` |
+| Language Model | `google/flan-t5-base` |
+| Graph Engine | NetworkX DiGraph |
+| Retrieval | HyperRAG (vector + graph) |
+| Interface | Gradio |
+### 🔷 Pipeline
+```
+PDF Upload
+   ↓
+Text Extraction (pdfplumber, page-aware)
+   ↓
+Chunking with Overlap
+   ↓
+Sentence-Transformer Embeddings
+   ↓
+Qdrant Vector Index
+   ↓
+Discourse Graph Construction
+   (claims → evidence → policies → metrics)
+   ↓
+HyperRAG Retrieval
+   (vector search + graph neighbourhood expansion)
+   ↓
+Flan-T5 Answer Generation
+```
+### 🔷 Outputs
+- **Q&A** with page-level evidence
+- **ESG pillar scores** (E, S, G + overall)
+- **Sector detection** and risk factors
+- **Greenwashing flags** linked to pages
+- **Discourse graph** statistics and hub nodes
+### 🔷 Limitations
+- Scores are keyword-density heuristics, not certified ratings
+- Model is `flan-t5-base` for CPU compatibility; upgrade to larger models for production
+- Greenwashing detection is pattern-based and requires expert validation
+*Built for research and demonstration purposes.*
+            """)
+    return demo
+# ══════════════════════════════════════════════════════════════════════════════
+#  9.  ENTRY POINT
+# ══════════════════════════════════════════════════════════════════════════════
+# Script entry point: build the interface and serve it. Nothing is launched
+# when the module is merely imported.
+if __name__ == "__main__":
+    demo = build_ui()
+    demo.launch(
+        server_name="0.0.0.0",   # listen on all network interfaces
+        server_port=7860,
+        share=False,             # no public share link
+    )