GirishaBuilds01 committed on
Commit
bf43189
·
verified ·
1 Parent(s): c162115

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +794 -0
app.py ADDED
@@ -0,0 +1,794 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multimodal ESG Document Intelligence Platform
3
+ Using HyperRAG and Discourse Graph Reasoning
4
+ """
5
+
6
+ import gradio as gr
7
+ import os
8
+ import json
9
+ import re
10
+ import time
11
+ import hashlib
12
+ from pathlib import Path
13
+ import numpy as np
14
+
15
+ # ── lazy imports ──────────────────────────────────────────────────────────────
16
def _import_pdf():
    """Lazily import ``pdfplumber`` and return the module object."""
    import pdfplumber as _pdfplumber

    return _pdfplumber
19
+
20
+ def _import_torch():
21
+ import torch
22
+ return torch
23
+
24
def _import_transformers():
    """Lazily import the ``transformers`` entry points.

    Returns:
        The tuple ``(pipeline, AutoTokenizer, AutoModelForSeq2SeqLM)``.
    """
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

    exports = (pipeline, AutoTokenizer, AutoModelForSeq2SeqLM)
    return exports
27
+
28
def _import_sentence_transformers():
    """Lazily import and return the ``SentenceTransformer`` class."""
    from sentence_transformers import SentenceTransformer as _encoder_cls

    return _encoder_cls
31
+
32
def _import_qdrant():
    """Lazily import the Qdrant client class and the model types used here.

    Returns:
        The tuple ``(QdrantClient, Distance, VectorParams, PointStruct,
        Filter, FieldCondition, MatchValue)``, in that order.
    """
    from qdrant_client import QdrantClient
    from qdrant_client import models as _models

    return (
        QdrantClient,
        _models.Distance,
        _models.VectorParams,
        _models.PointStruct,
        _models.Filter,
        _models.FieldCondition,
        _models.MatchValue,
    )
39
+
40
def _import_networkx():
    """Lazily import ``networkx`` and return the module object."""
    import networkx as _networkx

    return _networkx
43
+
44
+ # ── Constants ─────────────────────────────────────────────────────────────────
45
# Model / index configuration.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
QA_MODEL_NAME = "google/flan-t5-base"
COLLECTION_NAME = "esg_documents"
VECTOR_DIM = 384

# Chunking and retrieval parameters.
CHUNK_SIZE = 400  # tokens (approx words)
CHUNK_OVERLAP = 80
TOP_K_RETRIEVAL = 5
GRAPH_HOP_DEPTH = 2

# Phrases treated as sustainability claims by the claim / greenwashing checks.
GREENWASHING_KEYWORDS = [
    "carbon neutral",
    "net-zero",
    "net zero",
    "climate positive",
    "100% renewable",
    "fully sustainable",
    "zero emissions",
    "carbon negative",
    "eco-friendly",
    "green certified",
    "environmentally responsible",
    "carbon offset",
    "carbon credits",
    "biodegradable",
    "recyclable packaging",
    "zero waste",
    "nature positive",
    "planet positive",
]

# Keyword lists per ESG pillar, used for the keyword-density scores.
ESG_CATEGORIES = {
    "environmental": [
        "carbon", "emission", "climate", "renewable",
        "energy", "water", "waste", "biodiversity",
        "deforestation", "pollution", "recycling", "greenhouse",
        "sustainability", "fossil fuel", "solar", "wind",
    ],
    "social": [
        "employee", "diversity", "inclusion", "health",
        "safety", "community", "human rights", "labor",
        "gender", "training", "wellbeing", "supply chain",
        "stakeholder", "philanthropy", "education", "wage",
    ],
    "governance": [
        "board", "director", "audit", "compliance",
        "ethics", "transparency", "corruption", "bribery",
        "risk management", "disclosure", "accountability", "shareholder",
        "executive compensation", "policy", "regulation",
    ],
}
80
+
81
+ # ── Global State ──────────────────────────────────────────────────────────────
82
+ _state = {
83
+ "embed_model": None,
84
+ "qa_pipeline": None,
85
+ "qdrant_client": None,
86
+ "discourse_graph": None,
87
+ "chunks": [],
88
+ "doc_id": None,
89
+ "doc_name": "",
90
+ "is_ready": False,
91
+ }
92
+
93
+ # ══════════════════════════════════════════════════════════════════════════════
94
+ # 1. MODEL LOADING
95
+ # ══════════════════════════════════════════════════════════════════════════════
96
+
97
def load_models():
    """Populate the shared state with the models and vector store, once each.

    Every resource is created only if its ``_state`` slot is still ``None``:
    the sentence embedder, the text2text generation pipeline, and an
    in-memory Qdrant client with the document collection (re)created.

    Returns:
        A status string for display in the UI.
    """
    if _state["embed_model"] is None:
        encoder_cls = _import_sentence_transformers()
        _state["embed_model"] = encoder_cls(EMBED_MODEL_NAME)

    if _state["qa_pipeline"] is None:
        make_pipeline = _import_transformers()[0]
        _state["qa_pipeline"] = make_pipeline(
            "text2text-generation", model=QA_MODEL_NAME, max_new_tokens=256
        )

    if _state["qdrant_client"] is None:
        client_cls, distance, vector_params = _import_qdrant()[:3]
        store = client_cls(":memory:")
        # Start from an empty collection sized for the embedding model.
        store.recreate_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=vector_params(size=VECTOR_DIM, distance=distance.COSINE),
        )
        _state["qdrant_client"] = store

    return "βœ… Models loaded successfully"
122
+
123
+
124
+ # ══════════════════════════════════════════════════════════════════════════════
125
+ # 2. DOCUMENT PROCESSING
126
+ # ══════════════════════════════════════════════════════════════════════════════
127
+
128
def extract_text_from_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF and return the stripped text of every non-empty page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of ``{"page": <1-based page number>, "text": <page text>}``
        dicts; pages whose extracted text is empty are left out.
    """
    pdfplumber = _import_pdf()
    with pdfplumber.open(pdf_path) as document:
        numbered_text = [
            (number, (sheet.extract_text() or "").strip())
            for number, sheet in enumerate(document.pages, start=1)
        ]
    return [{"page": number, "text": body} for number, body in numbered_text if body]
139
+
140
+
141
def chunk_pages(pages: list[dict], chunk_size=None, overlap=None) -> list[dict]:
    """Split page texts into overlapping word windows.

    Pages are chunked independently, so a chunk never spans two pages.

    Args:
        pages: ``{"page": int, "text": str}`` dicts.
        chunk_size: Words per window; defaults to ``CHUNK_SIZE``.
        overlap: Words shared by consecutive windows; defaults to
            ``CHUNK_OVERLAP``.

    Returns:
        A list of ``{"chunk_id", "page", "text"}`` dicts with consecutive
        ``chunk_id`` values starting at 0.

    Raises:
        ValueError: If the window size is not positive or the overlap is not
            smaller than the window (the window would never advance).
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    lap = CHUNK_OVERLAP if overlap is None else overlap
    step = size - lap
    if size <= 0 or step <= 0:
        raise ValueError("chunk size must be positive and larger than the overlap")

    chunks = []
    for pg in pages:
        words = pg["text"].split()
        for start in range(0, len(words), step):
            end = min(start + size, len(words))
            chunk_text = " ".join(words[start:end])
            if len(chunk_text) > 50:  # skip tiny fragments
                chunks.append({
                    "chunk_id": len(chunks),
                    "page": pg["page"],
                    "text": chunk_text,
                })
            if end == len(words):
                # The window already reached the end of the page; another
                # step would only re-emit overlap words as a duplicate chunk.
                break
    return chunks
160
+
161
+
162
def embed_chunks(chunks: list[dict]) -> np.ndarray:
    """Encode the text of every chunk with the shared embedding model.

    Args:
        chunks: Chunk dicts carrying a ``"text"`` entry.

    Returns:
        Whatever the model's ``encode`` returns for the chunk texts, with
        normalized embeddings requested.
    """
    encoder = _state["embed_model"]
    return encoder.encode(
        [item["text"] for item in chunks],
        batch_size=32,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
169
+
170
+
171
def index_chunks(chunks: list[dict], embeddings: np.ndarray, doc_id: str):
    """Store every chunk and its vector in the Qdrant collection.

    Point ids are the positions of the chunks in ``chunks``; the payload
    carries the chunk id, page, text and the owning ``doc_id``.
    """
    point_cls = _import_qdrant()[3]
    records = [
        point_cls(
            id=position,
            vector=vector.tolist(),
            payload={
                "chunk_id": item["chunk_id"],
                "page": item["page"],
                "text": item["text"],
                "doc_id": doc_id,
            },
        )
        for position, (item, vector) in enumerate(zip(chunks, embeddings))
    ]
    _state["qdrant_client"].upsert(collection_name=COLLECTION_NAME, points=records)
188
+
189
+
190
+ # ══════════════════════════════════════════════════════════════════════════════
191
+ # 3. DISCOURSE GRAPH
192
+ # ══════════════════════════════════════════════════════════════════════════════
193
+
194
def classify_chunk_role(text: str) -> str:
    """Assign a discourse role to a chunk by case-insensitive substring match.

    Rules are tried in order and the first one with a matching marker wins:
    ``claim``, ``evidence``, ``policy``, ``metric``. Text matching none of
    them is ``context``.
    """
    lowered = text.lower()
    ordered_rules = (
        ("claim", GREENWASHING_KEYWORDS),
        ("evidence", ["data shows", "according to", "measured", "percent", "%", "tonnes", "mwh", "kwh"]),
        ("policy", ["policy", "commitment", "we will", "target", "goal", "by 2030", "by 2050"]),
        ("metric", ["kpi", "metric", "indicator", "score", "rating", "index"]),
    )
    for role, markers in ordered_rules:
        for marker in markers:
            if marker in lowered:
                return role
    return "context"
206
+
207
+
208
def build_discourse_graph(chunks: list[dict]) -> object:
    """Create the directed discourse graph for a document.

    Nodes are chunk ids annotated with ``text``, ``page`` and ``role``.
    Edges, added in this order:

    * ``follows``      - each chunk to the next one in ``chunks``;
    * ``supported_by`` - claim to evidence at most 2 pages apart;
    * ``measured_by``  - policy to metric at most 3 pages apart.

    Returns:
        The populated ``networkx.DiGraph``.
    """
    nx = _import_networkx()
    graph = nx.DiGraph()

    for item in chunks:
        graph.add_node(
            item["chunk_id"],
            text=item["text"],
            page=item["page"],
            role=classify_chunk_role(item["text"]),
        )

    # Narrative continuity between neighbouring chunks.
    for current, following in zip(chunks, chunks[1:]):
        graph.add_edge(current["chunk_id"], following["chunk_id"], relation="follows")

    def with_role(role):
        return [c for c in chunks if graph.nodes[c["chunk_id"]]["role"] == role]

    def link_within(sources, targets, max_page_gap, relation):
        for src in sources:
            for dst in targets:
                if abs(src["page"] - dst["page"]) <= max_page_gap:
                    graph.add_edge(src["chunk_id"], dst["chunk_id"], relation=relation)

    link_within(with_role("claim"), with_role("evidence"), 2, "supported_by")
    link_within(with_role("policy"), with_role("metric"), 3, "measured_by")

    return graph
245
+
246
+
247
+ # ══════════════════════════════════════════════════════════════════════════════
248
+ # 4. HyperRAG RETRIEVAL
249
+ # ══════════════════════════════════════════════════════════════════════════════
250
+
251
def vector_search(query: str, top_k: int = TOP_K_RETRIEVAL) -> list[dict]:
    """Return the ``top_k`` chunks closest to ``query`` in the Qdrant collection.

    Each hit is a dict with ``chunk_id``, ``page``, ``text`` and the
    similarity ``score`` rounded to four decimals.
    """
    encoded = _state["embed_model"].encode([query], normalize_embeddings=True)
    hits = _state["qdrant_client"].search(
        collection_name=COLLECTION_NAME,
        query_vector=encoded[0].tolist(),
        limit=top_k,
        with_payload=True,
    )
    found = []
    for hit in hits:
        payload = hit.payload
        found.append({
            "chunk_id": payload["chunk_id"],
            "page": payload["page"],
            "text": payload["text"],
            "score": round(hit.score, 4),
        })
    return found
269
+
270
+
271
def graph_expand(seed_chunk_ids: list[int], depth: int = GRAPH_HOP_DEPTH) -> list[int]:
    """Expand seed chunks to their neighbourhood in the discourse graph.

    Edges are followed in both directions for up to ``depth`` hops.

    Args:
        seed_chunk_ids: Chunk ids to start from.
        depth: Maximum number of hops.

    Returns:
        The unique chunk ids reached, seeds first and then each hop in
        discovery order, so callers that truncate keep the closest
        neighbours. If no graph has been built, ``seed_chunk_ids`` is
        returned unchanged.
    """
    G = _state["discourse_graph"]
    if G is None:
        return seed_chunk_ids

    # dict.fromkeys de-duplicates while keeping the seeds' ranking order.
    ordered = list(dict.fromkeys(seed_chunk_ids))
    visited = set(ordered)
    frontier = ordered
    for _ in range(depth):
        discovered = []
        for node in frontier:
            if not G.has_node(node):
                continue
            for neighbour in (*G.successors(node), *G.predecessors(node)):
                if neighbour not in visited:
                    visited.add(neighbour)
                    discovered.append(neighbour)
        if not discovered:
            break  # nothing new was reached, further hops cannot add nodes
        ordered.extend(discovered)
        frontier = discovered

    return ordered
291
+
292
+
293
def hyper_rag_retrieve(query: str) -> list[dict]:
    """Retrieve context for ``query`` by vector search plus graph expansion.

    Steps:
        1. Vector search yields the seed chunks and their scores.
        2. The discourse graph expands the seeds to neighbouring chunk ids.
        3. Known chunks are copied and tagged with ``score`` (0.0 for
           graph-only chunks) and ``from_graph``.
        4. Vector hits are ranked first (by descending score), followed by
           graph expansions; the list is cut to ``TOP_K_RETRIEVAL + 4``.
    """
    hits = vector_search(query, top_k=TOP_K_RETRIEVAL)
    vector_scores = {hit["chunk_id"]: hit["score"] for hit in hits}

    neighbourhood = graph_expand(
        [hit["chunk_id"] for hit in hits], depth=GRAPH_HOP_DEPTH
    )

    known = {item["chunk_id"]: item for item in _state["chunks"]}
    candidates = []
    for cid in neighbourhood:
        if cid not in known:
            continue
        enriched = dict(known[cid])
        enriched["score"] = vector_scores.get(cid, 0.0)
        enriched["from_graph"] = cid not in vector_scores
        candidates.append(enriched)

    # False sorts before True, so direct vector hits lead the ranking.
    ranked = sorted(candidates, key=lambda c: (c["from_graph"], -c["score"]))
    return ranked[: TOP_K_RETRIEVAL + 4]  # slightly more context for QA
322
+
323
+
324
+ # ══════════════════════════════════════════════════════════════════════════════
325
+ # 5. ANALYSIS MODULES
326
+ # ══════════════════════════════════════════════════════════════════════════════
327
+
328
def compute_esg_scores(chunks: list[dict], categories=None) -> dict:
    """Score each ESG pillar from keyword occurrences in the chunk texts.

    Keywords are counted as case-insensitive substrings of each chunk, so
    multi-word keywords ("fossil fuel", "supply chain") and inflected or
    punctuated forms ("emissions", "carbon,") are counted too.

    Args:
        chunks: Chunk dicts carrying a ``"text"`` entry.
        categories: Mapping of pillar name to keyword list; defaults to
            ``ESG_CATEGORIES``.

    Returns:
        A dict with one 0-100 score per pillar, scaled so the pillar with the
        most hits gets 100, plus ``"overall"`` (the mean of the pillar
        scores). All values are 0 when the chunks contain no words.
    """
    pillars = ESG_CATEGORIES if categories is None else categories
    hits = dict.fromkeys(pillars, 0)
    total_words = 0
    for chunk in chunks:
        text = chunk["text"].lower()
        total_words += len(text.split())
        for pillar, keywords in pillars.items():
            hits[pillar] += sum(text.count(kw) for kw in keywords)

    if total_words == 0:
        return {**dict.fromkeys(pillars, 0), "overall": 0}

    # Normalise to 0-100 relative to the strongest pillar.
    max_hits = max(hits.values(), default=0) or 1
    norm = {p: round(min(h / max_hits * 100, 100), 1) for p, h in hits.items()}
    norm["overall"] = round(sum(norm.values()) / (len(pillars) or 1), 1)
    return norm
346
+
347
+
348
def detect_sector(chunks: list[dict]) -> tuple[str, str]:
    """Guess the industry sector of a document and return its risk summary.

    Each sector's keywords are counted as case-insensitive substrings of the
    concatenated chunk text. The sector with the most hits wins; on a tie the
    one listed first is kept. With no hits at all a generic sector is
    returned.

    Returns:
        ``(sector_name, risk_description)``.
    """
    profiles = [
        ("Energy & Utilities",
         ["oil", "gas", "electricity", "utility", "power plant", "pipeline"],
         "High carbon exposure, stranded asset risk, regulatory transition"),
        ("Finance & Banking",
         ["bank", "investment", "portfolio", "loan", "insurance", "asset"],
         "ESG credit risk, greenwashing liability, regulatory compliance"),
        ("Technology",
         ["software", "data center", "cloud", "semiconductor", "hardware"],
         "E-waste, data privacy, supply chain ethics, energy usage"),
        ("Manufacturing",
         ["factory", "manufacturing", "production", "supply chain", "logistics"],
         "Scope 3 emissions, labour rights, waste management"),
        ("Consumer Goods",
         ["product", "retail", "consumer", "packaging", "brand"],
         "Packaging waste, supply chain transparency, greenwashing"),
        ("Real Estate",
         ["property", "building", "construction", "real estate", "infrastructure"],
         "Building energy efficiency, urban heat islands, climate resilience"),
        ("Healthcare",
         ["health", "pharmaceutical", "medical", "hospital", "clinical"],
         "Pharmaceutical waste, access to medicines, clinical trial ethics"),
        ("Agriculture & Food",
         ["agriculture", "food", "farming", "crop", "livestock"],
         "Land use, water scarcity, biodiversity loss, food waste"),
        ("Transportation",
         ["transport", "aviation", "shipping", "fleet", "logistics"],
         "Fleet emissions, fuel transition, last-mile logistics"),
    ]

    corpus = " ".join(item["text"] for item in chunks).lower()

    best_name, best_risk, best_hits = None, None, 0
    for name, keywords, risk in profiles:
        hit_count = sum(corpus.count(kw) for kw in keywords)
        # Strict comparison keeps the earliest sector when counts tie.
        if hit_count > best_hits:
            best_name, best_risk, best_hits = name, risk, hit_count

    if best_hits == 0:
        return (
            "General / Diversified",
            "Cross-sector ESG exposure, disclosure quality, stakeholder engagement",
        )
    return best_name, best_risk
382
+
383
+
384
def detect_greenwashing(chunks: list[dict], keywords=None) -> list[dict]:
    """Flag chunks that contain green-claim keywords.

    A flag is marked ``verified`` when the same chunk also contains a
    verification term (verified, certified, third party, audited, ISO, SBTi,
    science-based, independently). Verification terms are matched as whole
    words *after* the matched claim phrases are removed, so a claim such as
    "green certified" cannot verify itself and "unverified" or "isolated" do
    not count as evidence.

    Args:
        chunks: Chunk dicts carrying ``"page"`` and ``"text"``.
        keywords: Claim phrases to look for; defaults to
            ``GREENWASHING_KEYWORDS``.

    Returns:
        One dict per unique ``(page, first matched keyword)`` pair, in
        document order, with keys ``page``, ``keywords``, ``text_snip`` (at
        most 220 characters plus an ellipsis) and ``verified``.
    """
    claim_terms = GREENWASHING_KEYWORDS if keywords is None else keywords
    evidence_re = re.compile(
        r"\b(?:verified|certified|third[ -]party|audited|sbti|science-based|independently)\b"
        r"|\biso(?![a-z])"  # "iso 14001" / "iso14001", but not "isolated"
    )

    seen = set()
    unique_flags = []
    for chunk in chunks:
        text_lower = chunk["text"].lower()
        matched_kws = [kw for kw in claim_terms if kw in text_lower]
        if not matched_kws:
            continue

        # Deduplicate by page + first keyword, keeping the first occurrence.
        key = (chunk["page"], matched_kws[0])
        if key in seen:
            continue
        seen.add(key)

        # Strip the claim phrases so their own words are not read as evidence.
        residual = text_lower
        for kw in matched_kws:
            residual = residual.replace(kw, " ")

        unique_flags.append({
            "page": chunk["page"],
            "keywords": matched_kws,
            "text_snip": chunk["text"][:220] + ("…" if len(chunk["text"]) > 220 else ""),
            "verified": evidence_re.search(residual) is not None,
        })
    return unique_flags
412
+
413
+
414
def answer_question(question: str, context_chunks: list[dict]) -> str:
    """Ask the QA pipeline to answer ``question`` from the retrieved chunks.

    The chunks are rendered as page-tagged passages, joined, and cut to the
    first 3000 characters before being placed in the prompt.

    Returns:
        The stripped generated text, or ``"(Model error: ...)"`` if the
        pipeline call raises.
    """
    passages = [f"[Page {c['page']}] {c['text']}" for c in context_chunks]
    context = "\n\n".join(passages)
    clipped = context[:3000]
    prompt = (
        "You are an expert ESG analyst. Answer the question based ONLY on the provided context.\n\n"
        f"Context:\n{clipped}\n\n"
        f"Question: {question}\n\nAnswer:"
    )
    try:
        generations = _state["qa_pipeline"](prompt, max_new_tokens=256, do_sample=False)
        return generations[0]["generated_text"].strip()
    except Exception as e:
        # Best effort: surface the failure in the UI instead of raising.
        return f"(Model error: {e})"
429
+
430
+
431
+ # ══════════════════════════════════════════════════════════════════════════════
432
+ # 6. TOP-LEVEL PIPELINE
433
+ # ══════════════════════════════════════════════════════════════════════════════
434
+
435
def process_document(pdf_file) -> str:
    """Ingest an uploaded PDF: parse, chunk, embed, index and build the graph.

    Parsing, chunking and embedding run first. The shared state is only
    invalidated once the vector collection is about to be replaced, so an
    upload that yields no text leaves the previously processed document
    usable, while a failure during re-indexing cannot leave the old chunks
    paired with a new or empty index.

    Args:
        pdf_file: The uploaded file; either an object with a ``name``
            attribute holding the path, or something convertible to a path
            string.

    Returns:
        A Markdown status message (success summary or error text).
    """
    if pdf_file is None:
        return "❌ Please upload a PDF file."

    load_models()

    pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    doc_name = Path(pdf_path).name
    doc_id = hashlib.md5(doc_name.encode()).hexdigest()[:8]

    # 1. Parse
    pages = extract_text_from_pdf(pdf_path)
    if not pages:
        return "❌ Could not extract text from PDF. Try a text-based (not scanned) PDF."

    # 2. Chunk
    chunks = chunk_pages(pages)
    if not chunks:
        return "❌ Document appears empty after chunking."

    # 3. Embed
    embeddings = embed_chunks(chunks)

    # 4. Re-create Qdrant collection (fresh per upload). From here on the old
    #    document is gone, so mark the state as not ready until we finish.
    _state["is_ready"] = False
    _, Distance, VectorParams, *_ = _import_qdrant()
    _state["qdrant_client"].recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_DIM, distance=Distance.COSINE),
    )
    index_chunks(chunks, embeddings, doc_id)

    # 5. Build discourse graph
    graph = build_discourse_graph(chunks)

    # 6. Publish the new document in one step
    _state.update(
        discourse_graph=graph,
        chunks=chunks,
        doc_id=doc_id,
        doc_name=doc_name,
        is_ready=True,
    )

    role_counts = {}
    for _node, attrs in graph.nodes(data=True):
        role_counts[attrs["role"]] = role_counts.get(attrs["role"], 0) + 1

    return (
        f"βœ… **Document processed successfully!**\n\n"
        f"πŸ“„ **File:** {doc_name}\n"
        f"πŸ“‘ **Pages parsed:** {len(pages)}\n"
        f"πŸ”· **Chunks indexed:** {len(chunks)}\n"
        f"πŸ•ΈοΈ **Discourse graph nodes:** {graph.number_of_nodes()} | "
        f"edges: {graph.number_of_edges()}\n"
        f"🏷️ **Node roles:** {json.dumps(role_counts)}\n\n"
        f"You can now ask questions, view ESG scores, or run greenwashing detection."
    )
492
+
493
+
494
+ # ══════════════════════════════════════════════════════════════════════════════
495
+ # 7. GRADIO HANDLER FUNCTIONS
496
+ # ══════════════════════════════════════════════════════════════════════════════
497
+
498
def handle_question(question: str) -> tuple[str, str]:
    """UI handler: answer a question and list the evidence that was retrieved.

    Returns:
        ``(answer, evidence_markdown)``. When no document is ready or the
        question is blank, a warning and an empty evidence string.
    """
    if not _state["is_ready"]:
        return "⚠️ Please upload and process a document first.", ""
    if not question.strip():
        return "⚠️ Please enter a question.", ""

    retrieved = hyper_rag_retrieve(question)
    answer = answer_question(question, retrieved)

    panel = ["### πŸ“Ž Retrieved Evidence (HyperRAG)\n"]
    for rank, item in enumerate(retrieved, start=1):
        if item.get("from_graph"):
            tag = "🟣 graph"
        else:
            tag = "πŸ”΅ vector"
        snippet = item["text"][:300]
        panel.append(
            f"**[{rank}] Page {item['page']} | score={item['score']:.3f} | {tag}**\n"
            f"> {snippet}…\n"
        )

    return answer, "\n".join(panel)
517
+
518
+
519
def handle_esg_scores() -> str:
    """UI handler: render the ESG pillar scores and detected sector as Markdown."""
    if not _state["is_ready"]:
        return "⚠️ Please upload and process a document first."

    scores = compute_esg_scores(_state["chunks"])
    sector, risk = detect_sector(_state["chunks"])

    def bar(value):
        # 20-cell gauge, one filled cell per 5 points.
        filled = int(value / 5)
        return "β–ˆ" * filled + "β–‘" * (20 - filled)

    pillar_rows = "".join(
        f"| {label} | {scores[key]:.1f}/100 | `{bar(scores[key])}` |\n"
        for label, key in (
            ("🌿 Environmental", "environmental"),
            ("πŸ‘₯ Social", "social"),
            ("πŸ›οΈ Governance", "governance"),
        )
    )

    sections = [
        f"## πŸ“Š ESG Score Analysis β€” *{_state['doc_name']}*\n\n",
        "| Pillar | Score | Bar |\n",
        "|--------|-------|-----|\n",
        pillar_rows,
        f"| ⭐ **Overall** | **{scores['overall']:.1f}/100** | `{bar(scores['overall'])}` |\n\n",
        "---\n",
        "### 🏭 Sector Detection\n",
        f"**Identified Sector:** {sector}\n\n",
        f"**Key Risk Factors:** {risk}\n\n",
        "> *Scores are keyword-density proxies. For investment decisions, use certified ESG ratings.*",
    ]
    return "".join(sections)
543
+
544
+
545
def handle_greenwashing() -> str:
    """UI handler: render the greenwashing flags as a Markdown report.

    Flags are split into unverified claims (listed first) and claims whose
    chunk also carries verification wording.
    """
    if not _state["is_ready"]:
        return "⚠️ Please upload and process a document first."

    flags = detect_greenwashing(_state["chunks"])
    if not flags:
        return (
            "βœ… **No greenwashing keywords detected** in this document.\n\n"
            "The report does not contain common unsubstantiated sustainability claims."
        )

    unverified, verified = [], []
    for flag in flags:
        (verified if flag["verified"] else unverified).append(flag)

    def render(flag):
        kws = ", ".join(f"**{k}**" for k in flag["keywords"])
        return (
            f"πŸ“ **Page {flag['page']}** β€” Keywords: {kws}\n"
            f"> {flag['text_snip']}\n"
        )

    report = [
        f"## 🚨 Greenwashing Detection Report β€” *{_state['doc_name']}*\n",
        f"**Total flagged claims:** {len(flags)} "
        f"({len(unverified)} unverified ⚠️ | {len(verified)} with evidence βœ…)\n\n---\n",
    ]

    if unverified:
        report.append("### ⚠️ Unverified Claims (Higher Risk)\n")
        report.extend(render(flag) for flag in unverified)

    if verified:
        report.append("\n### βœ… Claims With Supporting Evidence\n")
        report.extend(render(flag) for flag in verified)

    report.append(
        "\n---\n*Greenwashing detection is keyword-based. "
        "Human expert review is recommended for investment decisions.*"
    )
    return "\n".join(report)
589
+
590
+
591
def handle_graph_insights() -> str:
    """UI handler: summarise the discourse graph as Markdown.

    Reports node and edge totals, counts per node role and per relation type
    (most frequent first), and the five highest-degree chunks.
    """
    if not _state["is_ready"]:
        return "⚠️ Please upload and process a document first."

    _import_networkx()  # same import call as before; the module itself is not used here
    graph = _state["discourse_graph"]

    def tally(labels):
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    def bullet_list(counts):
        ranked = sorted(counts.items(), key=lambda pair: -pair[1])
        return "\n".join(f"- `{label}`: {count}" for label, count in ranked)

    role_counts = tally(attrs["role"] for _, attrs in graph.nodes(data=True))
    edge_counts = tally(
        attrs.get("relation", "unknown") for _, _, attrs in graph.edges(data=True)
    )

    # Top connected nodes
    hubs = sorted(graph.degree(), key=lambda pair: -pair[1])[:5]
    hub_lines = []
    for node_id, degree in hubs:
        info = graph.nodes[node_id]
        hub_lines.append(
            f"- Chunk {node_id} | Page {info['page']} | Role: `{info['role']}` | Degree: {degree}"
        )

    sections = [
        f"## πŸ•ΈοΈ Discourse Graph Insights β€” *{_state['doc_name']}*\n\n",
        "### Graph Statistics\n",
        f"- **Nodes (chunks):** {graph.number_of_nodes()}\n",
        f"- **Edges (relations):** {graph.number_of_edges()}\n\n",
        "### Node Roles\n",
        bullet_list(role_counts),
        "\n\n### Relation Types\n",
        bullet_list(edge_counts),
        "\n\n### πŸ”— Most Connected Chunks (Hub Nodes)\n",
        "\n".join(hub_lines),
        "\n\n> *Hub nodes represent cross-referenced ESG statements β€” "
        "key evidence or policy anchors in the report.*",
    ]
    return "".join(sections)
631
+
632
+
633
+ # ══════════════════════════════════════════════════════════════════════════════
634
+ # 8. GRADIO UI
635
+ # ══════════════════════════════════════════════════════════════════════════════
636
+
637
# Custom stylesheet passed to gr.Blocks(css=...).
CSS = (
    "\n"
    ".gr-button-primary { background: #1a6b3c !important; }\n"
    ".gr-button { border-radius: 8px !important; }\n"
    "#title { text-align: center; margin-bottom: 0.5em; }\n"
    "#subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }\n"
)
643
+
644
def build_ui():
    """Assemble the Gradio Blocks interface and return it without launching.

    The layout is a title, a subtitle and six tabs: upload/process, Q&A,
    ESG scores, greenwashing detection, discourse graph, and a static About
    page. Each button is wired to the matching handler in this module.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation was flattened - confirm the placement of the ``.click``
    bindings and of ``gr.Examples`` against the original file.
    """
    with gr.Blocks(css=CSS, title="ESG Intelligence Platform") as demo:

        gr.Markdown(
            "# 🌿 Multimodal ESG Document Intelligence Platform\n"
            "### HyperRAG + Discourse Graph Reasoning",
            elem_id="title"
        )
        gr.Markdown(
            "Upload an ESG report (PDF) to unlock **semantic Q&A**, "
            "**ESG scoring**, **greenwashing detection**, and **graph-based reasoning**.",
            elem_id="subtitle"
        )

        # ── Upload Tab ─────────────────────────────────────────────────────
        with gr.Tab("πŸ“€ Upload & Process"):
            with gr.Row():
                with gr.Column(scale=2):
                    pdf_input = gr.File(label="Upload ESG Report (PDF)", file_types=[".pdf"])
                    upload_btn = gr.Button("βš™οΈ Process Document", variant="primary")
                with gr.Column(scale=3):
                    upload_out = gr.Markdown(
                        "Upload a PDF and click **Process Document** to begin."
                    )
            # Ingestion pipeline; its status message is shown in upload_out.
            upload_btn.click(process_document, inputs=pdf_input, outputs=upload_out)

        # ── Q&A Tab ────────────────────────────────────────────────────────
        with gr.Tab("πŸ’¬ Ask Questions (HyperRAG)"):
            gr.Markdown(
                "Ask any question about the ESG report. "
                "The HyperRAG pipeline combines **vector search** with "
                "**discourse graph expansion** for multi-hop reasoning."
            )
            question_input = gr.Textbox(
                placeholder="e.g. What are the company's carbon reduction targets?",
                label="Your Question",
                lines=2,
            )
            ask_btn = gr.Button("πŸ” Ask", variant="primary")
            answer_md = gr.Markdown(label="Answer")
            evid_md = gr.Markdown(label="Supporting Evidence")

            # Example questions
            gr.Examples(
                examples=[
                    ["What are the company's Scope 1, 2, and 3 emissions?"],
                    ["What diversity and inclusion initiatives are mentioned?"],
                    ["What governance policies are in place for risk management?"],
                    ["What renewable energy targets has the company set?"],
                    ["How does the company address human rights in its supply chain?"],
                ],
                inputs=question_input,
            )
            # handle_question returns (answer, evidence) for the two panels.
            ask_btn.click(handle_question, inputs=question_input, outputs=[answer_md, evid_md])

        # ── ESG Scores Tab ─────────────────────────────────────────────────
        with gr.Tab("πŸ“Š ESG Scores & Sector"):
            score_btn = gr.Button("πŸ“ˆ Compute ESG Scores", variant="primary")
            score_out = gr.Markdown()
            score_btn.click(handle_esg_scores, outputs=score_out)

        # ── Greenwashing Tab ───────────────────────────────────────────────
        with gr.Tab("🚨 Greenwashing Detection"):
            gr.Markdown(
                "Detects unsubstantiated sustainability claims and links them "
                "to **exact page numbers** in the report."
            )
            gw_btn = gr.Button("πŸ”Ž Detect Greenwashing Claims", variant="primary")
            gw_out = gr.Markdown()
            gw_btn.click(handle_greenwashing, outputs=gw_out)

        # ── Graph Tab ──────────────────────────────────────────────────────
        with gr.Tab("πŸ•ΈοΈ Discourse Graph"):
            gr.Markdown(
                "View the **discourse graph** that models logical relationships "
                "(claims, evidence, policies, metrics) between ESG statements."
            )
            graph_btn = gr.Button("πŸ”¬ Analyse Discourse Graph", variant="primary")
            graph_out = gr.Markdown()
            graph_btn.click(handle_graph_insights, outputs=graph_out)

        # ── About Tab ──────────────────────────────────────────────────────
        # Static description of the architecture, pipeline and limitations.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## About This Platform

This prototype implements the **Multimodal ESG Document Intelligence Platform**
combining three advanced AI techniques:

### πŸ”· Architecture

| Component | Technology |
|-----------|------------|
| Vector Store | Qdrant (in-memory) |
| Embeddings | `sentence-transformers/all-MiniLM-L6-v2` |
| Language Model | `google/flan-t5-base` |
| Graph Engine | NetworkX DiGraph |
| Retrieval | HyperRAG (vector + graph) |
| Interface | Gradio |

### πŸ”· Pipeline

```
PDF Upload
↓
Text Extraction (pdfplumber, page-aware)
↓
Chunking with Overlap
↓
Sentence-Transformer Embeddings
↓
Qdrant Vector Index
↓
Discourse Graph Construction
(claims β†’ evidence β†’ policies β†’ metrics)
↓
HyperRAG Retrieval
(vector search + graph neighbourhood expansion)
↓
Flan-T5 Answer Generation
```

### πŸ”· Outputs
- **Q&A** with page-level evidence
- **ESG pillar scores** (E, S, G + overall)
- **Sector detection** and risk factors
- **Greenwashing flags** linked to pages
- **Discourse graph** statistics and hub nodes

### πŸ”· Limitations
- Scores are keyword-density heuristics, not certified ratings
- Model is `flan-t5-base` for CPU compatibility; upgrade to larger models for production
- Greenwashing detection is pattern-based and requires expert validation

*Built for research and demonstration purposes.*
""")

    return demo
782
+
783
+
784
+ # ══════════════════════════════════════════════════════════════════════════════
785
+ # 9. ENTRY POINT
786
+ # ══════════════════════════════════════════════════════════════════════════════
787
+
788
# Build and serve the UI only when this file is run as a script.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860,
        share=False,
    )