# Major_Project / app.py
# riyasuryawanshi746's picture
# Fixed PDF preprocessing and clause segmentation
# 6893de4 verified
# app.py
# ClauseXplain v5.3 — hardening pass
# Changes vs v5.2:
# • ICA_007 (uncapped indemnity) now requires has_uncapped_signal — no more
# auto-firing on every "indemnify" mention
# • analyze_document uses level_from_score() from inference.py (single source
# of truth for the new 0.50 / 0.80 risk-level cutoffs)
from __future__ import annotations
import os
import re
import gc
import tempfile
import threading
import numpy as np
import torch
import torch.nn as nn
import gradio as gr
from pathlib import Path
from transformers import LongformerTokenizer, LongformerModel
from sklearn.preprocessing import MultiLabelBinarizer
from huggingface_hub import hf_hub_download
from feature_extractor import ClauseFeatureExtractor
from explanation import generate_explanation
from inference import level_from_score # v5.3: single source of truth
from utils import highlight_keywords
# ── Optional / fail-soft integrations ─────────────────────────────────────────
# Every optional integration below is loaded defensively: if the import (or the
# constructor call) raises, a warning is printed and a ``None`` / no-op
# stand-in is bound under the same name so the rest of the app can still start.

# Natural-language summarizer.
try:
    from nl_summary import NLSummarizer

    nl_summarizer = NLSummarizer()
except Exception as _exc:
    print(f"[WARN] NLSummarizer disabled: {_exc}")
    nl_summarizer = None

# Local explainer (LIME / attention) and the predict-function factory it needs.
try:
    from local_interpreters import LocalExplainer, build_predict_fn_for_manager

    local_explainer = LocalExplainer(num_samples=25, timeout_seconds=25.0)
except Exception as _exc:
    print(f"[WARN] LocalExplainer disabled: {_exc}")
    local_explainer = None
    build_predict_fn_for_manager = None

# HTML renderers for the explainer output; the fallbacks render nothing.
try:
    from attention_visualization import attention_heatmap_html, lime_html
except Exception as _exc:
    print(f"[WARN] attention_visualization disabled: {_exc}")

    def attention_heatmap_html(*_args, **_kwargs):
        """Fallback renderer: accept any arguments and emit no markup."""
        return ""

    def lime_html(*_args, **_kwargs):
        """Fallback renderer: accept any arguments and emit no markup."""
        return ""

# PDF report generator.
try:
    from report import generate_report
except Exception as _exc:
    print(f"[WARN] report disabled: {_exc}")
    generate_report = None
# Inference device, pinned to CPU. NOTE(review): not referenced anywhere else
# in the visible part of this file (the checkpoint load builds its own
# torch.device("cpu")) — confirm whether other modules import it.
DEVICE = torch.device("cpu")
# Clause-type labels. ModelManager fits its clause MultiLabelBinarizer on this
# list and sizes the model's clause head from it; predictions are mapped back
# to names by index.
# NOTE(review): the order presumably has to match the order used when the
# checkpoint was trained — confirm before reordering or inserting entries.
CLAUSE_CLASSES = [
    "Cap On Liability", "Change Of Control", "Covenant Not To Sue",
    "Exclusivity", "Governing Law", "IP Ownership Assignment",
    "Irrevocable Or Perpetual License", "Joint IP Ownership",
    "License Grant", "Liquidated Damages", "Minimum Commitment",
    "Most Favored Nation", "No-Solicit Of Customers", "No-Solicit Of Employees",
    "Non-Compete", "Notice Period To Terminate Renewal", "Post-Termination Services",
    "Price Restrictions", "Revenue/Profit Sharing", "Renewal Term",
    "Source Code Escrow", "Uncapped Liability", "Unlimited/All-You-Can-Eat-License",
    "Volume Restriction", "Warranty Duration",
    "Anti-Assignment", "Audit Rights", "Competitive Restriction Exception",
    "Expiration Date", "Insurance", "Ip Indemnification",
    "Limitation Of Liability", "Non-Disparagement", "Parties",
    "Permitted Development", "Rofr/Rofo/Rofn", "Third Party Beneficiary",
    "Termination For Convenience", "Affiliate License-Licensor",
    "Affiliate License-Licensee", "Agreement Date",
]
# Risk-category labels; used the same way as CLAUSE_CLASSES for the risk head.
RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
# ─────────────────────────────────────────────────────────────────────────────
# Symbolic rules — v5.3 tightened
# ─────────────────────────────────────────────────────────────────────────────
# Each rule is a dict with:
#   rule_id    – short stable identifier shown in the UI
#   name       – human-readable title
#   reference  – statute / section the rule is based on
#   penalty    – numeric weight carried by the rule (consumed by the symbolic
#                scorer in inference.py, which is not part of this file)
#   category   – risk bucket, used to pick an icon in the UI
#   condition  – callable taking the extracted feature dict ``f`` and returning
#                a truthy value when the rule fires
# Conditions only use ``f.get`` so a missing feature never raises.
SYMBOLIC_RULES = [
    {"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
     "reference": "Indian Contract Act 1872, S.23", "penalty": 0.45, "category": "financial",
     "condition": lambda f: f.get("has_liability_cap") and f.get("excludes_gross_negligence")},
    {"rule_id": "ICA_002", "name": "Unilateral Termination Without Notice",
     "reference": "Indian Contract Act 1872, S.39", "penalty": 0.35, "category": "enforceability",
     "condition": lambda f: f.get("unilateral_termination") and not f.get("notice_period_defined")},
    # ``or 0`` guards against the feature being present but None, which would
    # otherwise raise TypeError on the ``>`` comparison.
    {"rule_id": "ICA_003", "name": "Non-Compete Exceeding 2 Years",
     "reference": "Indian Contract Act 1872, S.27", "penalty": 0.55, "category": "enforceability",
     "condition": lambda f: (f.get("non_compete_years") or 0) > 2},
    {"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
     "reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
     "condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
    # ICA_005: only fires on explicit gambling vocab — no more "contingent on closing"
    {"rule_id": "ICA_005", "name": "Wagering / Gambling Agreement",
     "reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
     "condition": lambda f: f.get("is_wagering_clause")},
    {"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
     "reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
     "condition": lambda f: f.get("restrains_legal_proceedings")},
    # ICA_007 TIGHTENED: indemnity + explicit uncapped signal + no cap
    {"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
     "reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
     "condition": lambda f: (
         f.get("has_indemnity_clause")
         and f.get("has_uncapped_signal")
         and not f.get("indemnity_capped")
     )},
    {"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
     "reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
     "condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
    {"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
     "reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
     "condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
    # ICA_010 narrowed via tightened has_exclusivity patterns in feature_extractor
    {"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
     "reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
     "condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
    {"rule_id": "ICA_011", "name": "Unilateral Price Modification",
     "reference": "Indian Contract Act 1872, S.62 + CPA 2019", "penalty": 0.45, "category": "financial",
     "condition": lambda f: f.get("unilateral_price_change")},
    {"rule_id": "DPDPA_001", "name": "Missing Data Retention Clause",
     "reference": "DPDPA 2023, S.8(7)", "penalty": 0.50, "category": "compliance",
     "condition": lambda f: f.get("processes_personal_data") and not f.get("has_data_retention_clause")},
    {"rule_id": "DPDPA_002", "name": "Broad Pre-existing IP Assignment",
     "reference": "ICA 1872, S.27 + DPDPA 2023", "penalty": 0.40, "category": "ip",
     "condition": lambda f: f.get("assigns_all_ip") and f.get("includes_pre_existing_ip")},
    {"rule_id": "DPDPA_003", "name": "No Data Principal Consent Mechanism",
     "reference": "DPDPA 2023, S.6", "penalty": 0.55, "category": "compliance",
     "condition": lambda f: f.get("processes_sensitive_data") and not f.get("has_consent_clause")},
    {"rule_id": "DPDPA_004", "name": "No Data Breach Notification Clause",
     "reference": "DPDPA 2023, S.8(6)", "penalty": 0.45, "category": "compliance",
     "condition": lambda f: f.get("processes_personal_data") and not f.get("has_breach_notification")},
    {"rule_id": "ITA_001", "name": "No Cybersecurity Obligation",
     "reference": "IT Act 2000, S.43A", "penalty": 0.35, "category": "compliance",
     "condition": lambda f: f.get("handles_digital_data") and not f.get("has_security_clause")},
    {"rule_id": "CPA_001", "name": "Unfair Contract Term (Consumer)",
     "reference": "Consumer Protection Act 2019, S.2(46)", "penalty": 0.50, "category": "enforceability",
     "condition": lambda f: f.get("is_consumer_contract") and f.get("has_one_sided_clause")},
]
# ─────────────────────────────────────────────────────────────────────────────
# Model (unchanged)
# ─────────────────────────────────────────────────────────────────────────────
class ClauseXplainV5(nn.Module):
    """Longformer encoder with clause, risk-category and fused risk-score heads.

    Attribute names and the layout of ``risk_fusion`` are deliberately left
    untouched: they determine the parameter names a saved state dict is
    matched against.
    """

    def __init__(self, num_clause_labels: int, num_risk_labels: int):
        super().__init__()
        self.encoder = LongformerModel.from_pretrained("allenai/longformer-base-4096")
        width = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        # Two independent linear heads on the pooled representation.
        self.clause_head = nn.Linear(width, num_clause_labels)
        self.risk_head = nn.Linear(width, num_risk_labels)
        # The fusion MLP consumes the pooled vector concatenated with both
        # sets of per-label probabilities.
        fusion_in = width + num_risk_labels + num_clause_labels
        self.risk_fusion = nn.Sequential(
            nn.Linear(fusion_in, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1),
        )
        self.risk_level_classifier = nn.Linear(1, 3)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and all heads.

        Returns the tuple ``(clause_logits, risk_logits, risk_score,
        risk_level_logits, pooled)``; ``risk_score`` has already been passed
        through a sigmoid, the two ``*_logits`` heads have not.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the pooled vector.
        first_token = encoded.last_hidden_state[:, 0]
        pooled = self.dropout(first_token)

        clause_logits = self.clause_head(pooled)
        risk_logits = self.risk_head(pooled)

        fused_features = torch.cat(
            [pooled, torch.sigmoid(risk_logits), torch.sigmoid(clause_logits)],
            dim=1,
        )
        risk_score = torch.sigmoid(self.risk_fusion(fused_features))
        risk_level_logits = self.risk_level_classifier(risk_score)
        return clause_logits, risk_logits, risk_score, risk_level_logits, pooled
# ─────────────────────────────────────────────────────────────────────────────
# ModelManager
# ─────────────────────────────────────────────────────────────────────────────
class ModelManager:
    """Own the tokenizer/model lifecycle and run clause- and document-level analysis.

    The checkpoint is loaded lazily on first use (``ensure_loaded``). When
    loading or neural inference fails, analysis falls back to a neural score
    of 0.0 and relies on the symbolic rule score instead of raising.
    """

    def __init__(self):
        self.model: ClauseXplainV5 | None = None
        self.tokenizer: LongformerTokenizer | None = None
        self.clause_mlb: MultiLabelBinarizer | None = None
        self.risk_mlb: MultiLabelBinarizer | None = None
        self.feature_extractor = ClauseFeatureExtractor()
        self.is_ready: bool = False      # True only after a successful load
        self.load_error: str = ""        # str(exception) of a failed load
        self._lock = threading.Lock()    # serialises the one-time load
        self._load_attempted: bool = False

    def _build_mlbs(self):
        """Return ``(clause_mlb, risk_mlb)`` fitted on the module-level label lists."""
        clause_mlb = MultiLabelBinarizer(classes=CLAUSE_CLASSES)
        clause_mlb.fit([[c] for c in CLAUSE_CLASSES])
        risk_mlb = MultiLabelBinarizer(classes=RISK_CLASSES)
        risk_mlb.fit([[r] for r in RISK_CLASSES])
        return clause_mlb, risk_mlb

    def ensure_loaded(self):
        """Load the model once; later calls return immediately.

        The ``_load_attempted`` flag is only set *after* ``_do_load`` has
        finished. Setting it before the load (as was done previously) let a
        concurrent caller pass the unlocked fast-path check while the load was
        still running and continue with ``is_ready == False``.
        """
        if self._load_attempted:
            return
        with self._lock:
            if self._load_attempted:
                return
            try:
                self._do_load()
            finally:
                # Mark the attempt even if something escaped _do_load, so a
                # broken load is never retried on every request.
                self._load_attempted = True

    def _do_load(self):
        """Build tokenizer, label binarizers and model, then load the checkpoint.

        Never raises for ``Exception`` subclasses: on failure ``is_ready`` is
        False and ``load_error`` holds the message.
        """
        try:
            print("[INFO] Loading tokenizer…")
            self.tokenizer = LongformerTokenizer.from_pretrained(
                "allenai/longformer-base-4096"
            )
            self.clause_mlb, self.risk_mlb = self._build_mlbs()
            print("[INFO] Building model architecture…")
            self.model = ClauseXplainV5(
                num_clause_labels=len(self.clause_mlb.classes_),
                num_risk_labels=len(self.risk_mlb.classes_),
            )
            print("[INFO] Downloading checkpoint from HuggingFace Hub…")
            ckpt_path = hf_hub_download(
                repo_id="riyasuryawanshi746/clauseXplain",
                filename="clausexplain_v5_best.pt",
            )
            # SECURITY NOTE(review): weights_only=False unpickles arbitrary
            # objects from the downloaded file. Kept as-is because the
            # checkpoint layout is not visible here; switch to
            # weights_only=True if the file only contains tensors.
            checkpoint = torch.load(
                ckpt_path,
                map_location=torch.device("cpu"),
                weights_only=False,
                mmap=True,
            )
            # Accept either a wrapped checkpoint or a bare state dict.
            if isinstance(checkpoint, dict) and "model_state" in checkpoint:
                state_dict = checkpoint["model_state"]
            elif isinstance(checkpoint, dict) and "state_dict" in checkpoint:
                state_dict = checkpoint["state_dict"]
            else:
                state_dict = checkpoint
            # Strip a leading "module." prefix only; str.replace would also
            # rewrite a "module." that appears in the middle of a key.
            prefix = "module."
            cleaned = {
                (k[len(prefix):] if k.startswith(prefix) else k): v
                for k, v in state_dict.items()
            }
            load_result = self.model.load_state_dict(cleaned, strict=False)
            # strict=False hides mismatches; surface them in the log.
            missing = list(getattr(load_result, "missing_keys", []) or [])
            unexpected = list(getattr(load_result, "unexpected_keys", []) or [])
            if missing or unexpected:
                print(f"[WARN] Checkpoint/model key mismatch: "
                      f"{len(missing)} missing, {len(unexpected)} unexpected")
            self.model.eval()
            # Drop every reference to the checkpoint tensors before collecting.
            del checkpoint, state_dict, cleaned
            gc.collect()
            self.is_ready = True
            print("[INFO] ✓ Model loaded and ready (CPU mode)")
        except Exception as e:
            self.load_error = str(e)
            self.is_ready = False
            print(f"[ERROR] Model load failed: {e}")

    def analyze_clause(self, text: str) -> dict:
        """Score one clause with the symbolic rules and, if available, the model.

        Returns a dict with the fused ``risk_score``, the separate neural and
        symbolic scores, the risk level (with and without emoji), the top
        clause types / risk categories, the triggered rules (without their
        ``condition`` callables), truthy features, evidence, the score
        breakdown and a confidence value.
        """
        from inference import (
            _symbolic_rule_score, _neuro_symbolic_fusion,
            _compute_confidence, IP_CLAUSE_TYPES,
        )
        self.ensure_loaded()
        features, evidence = self.feature_extractor.extract(text)
        sym_result = _symbolic_rule_score(features, SYMBOLIC_RULES)
        # Log size and a one-line preview of what is about to be scored.
        word_count = len(text.split())
        print(f"[DEBUG] analyze_clause: {word_count} words | "
              f"preview: {text[:80].replace(chr(10),' ')!r}")
        if self.is_ready and self.model is not None:
            try:
                enc = self.tokenizer(
                    text, padding="max_length", truncation=True,
                    max_length=256, return_tensors="pt",
                )
                with torch.no_grad():
                    clause_logits, risk_logits, risk_score_tensor, _, _ = self.model(
                        enc["input_ids"], enc["attention_mask"],
                    )
                clause_probs = torch.sigmoid(clause_logits).numpy()[0]
                top3_idx = clause_probs.argsort()[::-1][:3]
                top_clauses = [
                    (self.clause_mlb.classes_[i], round(float(clause_probs[i]), 3))
                    for i in top3_idx if clause_probs[i] > 0.05
                ]
                risk_probs = torch.sigmoid(risk_logits).numpy()[0]
                top2_idx = risk_probs.argsort()[::-1][:2]
                top_risks = [
                    (self.risk_mlb.classes_[i], round(float(risk_probs[i]), 3))
                    for i in top2_idx if risk_probs[i] > 0.05
                ]
                neural_score = round(float(risk_score_tensor.item()), 3)
                top_clause_name = top_clauses[0][0] if top_clauses else ""
                is_ip = top_clause_name in IP_CLAUSE_TYPES
                neural_loaded = True
            except Exception as e:
                # Best effort: a failed forward pass degrades to symbolic only.
                print(f"[WARN] Neural inference failed: {e}")
                neural_score, top_clauses, top_risks, is_ip = 0.0, [], [], False
                neural_loaded = False
        else:
            neural_score, top_clauses, top_risks, is_ip = 0.0, [], [], False
            neural_loaded = False
        fusion = _neuro_symbolic_fusion(neural_score, sym_result["symbolic_score"], is_ip)
        confidence = _compute_confidence(
            neural=neural_score, symbolic=sym_result["symbolic_score"],
            fused=fusion["score"], num_triggered=len(sym_result["triggered_rules"]),
            neural_loaded=neural_loaded,
        )
        # Copy only the plain-data fields of each triggered rule.
        triggered_clean = [
            {"rule_id": r["rule_id"], "name": r["name"],
             "reference": r["reference"], "penalty": r["penalty"],
             "category": r["category"]}
            for r in sym_result["triggered_rules"]
        ]
        return {
            "risk_score": fusion["score"],
            "neural_score": neural_score,
            "symbolic_score": sym_result["symbolic_score"],
            "risk_level": f"{fusion['emoji']} {fusion['level']}",
            "risk_level_raw": fusion["level"],
            "top_clauses": top_clauses,
            "top_risk_cats": top_risks,
            "triggered_rules": triggered_clean,
            "features": {k: v for k, v in features.items() if v},
            "evidence": evidence,
            "score_breakdown": fusion["breakdown"],
            "confidence": confidence,
        }

    def analyze_document(self, text: str, max_clauses: int = 50) -> dict:
        """Split ``text`` into clauses, analyse each, and aggregate the scores.

        At most ``max_clauses`` clauses are analysed. If the splitter yields
        nothing, the first 2000 characters are analysed as a single clause.
        The overall risk is ``0.70 * max + 0.30 * mean`` of the clause scores.
        """
        from pdf_utils import split_into_clauses_with_metadata
        clauses_meta = split_into_clauses_with_metadata(text)[:max_clauses]
        if not clauses_meta:
            clauses_meta = [{"text": text[:2000], "number": None, "kind": "paragraph"}]
        results = []
        for idx, meta in enumerate(clauses_meta):
            clause_text = meta["text"]
            try:
                r = self.analyze_clause(clause_text)
            except Exception as e:
                # One bad clause must not abort the document; record a
                # neutral placeholder so indices stay aligned.
                print(f"[WARN] Clause {idx+1} failed: {e}")
                r = {
                    "risk_score": 0.0, "neural_score": 0.0, "symbolic_score": 0.0,
                    "risk_level": "🟢 Low", "risk_level_raw": "Low",
                    "top_clauses": [], "top_risk_cats": [],
                    "triggered_rules": [], "features": {}, "evidence": {},
                    "score_breakdown": None, "confidence": None,
                }
            r["clause_index"] = idx + 1
            r["clause_text"] = clause_text
            r["clause_number"] = meta.get("number")
            r["clause_kind"] = meta.get("kind")
            results.append(r)
        scores = [r["risk_score"] for r in results]
        overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
        # v5.3: single source of truth for thresholds
        level, _ = level_from_score(overall)
        return {
            "overall_risk": overall,
            "overall_level": level,
            "num_clauses": len(results),
            "top_risks": sorted(results, key=lambda x: x["risk_score"], reverse=True)[:3],
            "clauses": results,
        }
# Module-level singleton shared by every UI callback below. Constructing it
# does not load the model; that happens on the first ensure_loaded() call.
manager = ModelManager()
# ═══════════════════════════════════════════════════════════════════════════════
# UI helpers (unchanged from v5.2)
# ═══════════════════════════════════════════════════════════════════════════════
# Lookup tables keyed by risk / confidence level name ("Low" | "Medium" | "High").
# Emoji shown next to a risk level in the markdown table and the dropdown.
LEVEL_COLOR = {"Low": "🟢", "Medium": "🟡", "High": "🔴"}
# Hex colour per risk level (gauge, bars, badges, card borders).
LEVEL_HEX = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
# Hex colour per confidence level (confidence badge only).
CONF_HEX = {"Low": "#f87171", "Medium": "#fbbf24", "High": "#34d399"}
# Icon per rule category; callers fall back to "⚠️" for unknown categories.
CAT_ICON = {"financial": "💰", "enforceability": "⚖️", "compliance": "🛡️",
            "ip": "🧠", "structural": "🏗️", "ambiguity": "❓"}
def _risk_gauge_html(pct: int, level: str) -> str:
color = LEVEL_HEX.get(level, "#6b7280")
dash = round(169.6 * pct / 100, 1)
return f"""
<div style="display:flex;flex-direction:column;align-items:center;gap:4px">
<svg width="140" height="80" viewBox="0 0 140 80">
<path d="M 14 70 A 56 56 0 0 1 126 70"
fill="none" stroke="#1e293b" stroke-width="14" stroke-linecap="round"/>
<path d="M 14 70 A 56 56 0 0 1 126 70"
fill="none" stroke="{color}" stroke-width="14" stroke-linecap="round"
stroke-dasharray="{dash} 169.6"/>
<text x="70" y="66" text-anchor="middle" font-size="22" font-weight="700"
font-family="'DM Mono',monospace" fill="{color}">{pct}%</text>
</svg>
<span style="font-size:12px;font-weight:600;letter-spacing:.08em;
color:{color};text-transform:uppercase">{level} RISK</span>
</div>"""
def _mini_bar(pct: int, level: str) -> str:
color = LEVEL_HEX.get(level, "#6b7280")
return (f'<div style="background:#1e293b;border-radius:4px;height:6px;width:100%">'
f'<div style="background:{color};width:{pct}%;height:6px;border-radius:4px"></div></div>')
def _confidence_badge(confidence) -> str:
if not confidence:
return ""
lvl = confidence.get("level", "Medium")
pct = int(confidence.get("score", 0) * 100)
col = CONF_HEX.get(lvl, "#94a3b8")
return (f'<span class="cx-badge" style="background:{col}22;color:{col}">'
f'CONF · {lvl.upper()} {pct}%</span>')
def _evidence_pills(evidence_dict: dict) -> str:
if not evidence_dict:
return ""
seen, pills = set(), []
for feat, hits in evidence_dict.items():
for h in hits:
phrase = h.get("phrase", "").strip()
key = phrase.lower()
if phrase and key not in seen:
seen.add(key)
pills.append(f'<span class="cx-ev-pill">“{phrase}”</span>')
if len(pills) >= 6: break
if len(pills) >= 6: break
return "".join(pills)
def _score_breakdown_html(breakdown) -> str:
if not breakdown:
return ""
w = breakdown["weights"]
return f"""
<div class="cx-breakdown">
<div class="cx-bd-row">
<span class="cx-bd-k">Neural</span>
<span class="cx-bd-v">{breakdown['neural_score']:.3f}</span>
<span class="cx-bd-w">× {w['neural']:.2f}</span>
</div>
<div class="cx-bd-row">
<span class="cx-bd-k">Symbolic</span>
<span class="cx-bd-v">{breakdown['symbolic_score']:.3f}</span>
<span class="cx-bd-w">× {w['symbolic']:.2f}</span>
</div>
<hr class="cx-bd-sep"/>
<div class="cx-bd-formula">{breakdown['formula']}</div>
<div class="cx-bd-final">
<span class="cx-bd-k">Final</span>
<span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
</div>
</div>"""
# ═══════════════════════════════════════════════════════════════════════════════
# Analysis flow (unchanged structurally)
# ═══════════════════════════════════════════════════════════════════════════════
def _run_analysis(text: str):
if not text or len(text.strip()) < 30:
return None, "⚠️ Input too short — please paste at least one full clause."
try:
return manager.analyze_document(text), ""
except Exception as e:
return None, f"❌ Analysis error: {e}"
def analyze_pdf(pdf_file):
    """Gradio callback: extract text from an uploaded PDF and analyse it.

    Returns the same 6-tuple of component updates as ``_build_outputs``; a
    missing upload or an extraction failure yields the empty-state outputs
    carrying an explanatory message.
    """
    if pdf_file is None:
        return _empty_outputs("No file uploaded.")

    # Imported lazily, after the cheap "no upload" exit.
    from pdf_utils import extract_text_from_pdf

    try:
        extracted = extract_text_from_pdf(pdf_file)
    except Exception as exc:
        return _empty_outputs(f"❌ PDF read error: {exc}")
    else:
        return _build_outputs(extracted)
def analyze_text(raw_text: str):
    """Gradio callback: analyse pasted text and return the UI outputs."""
    outputs = _build_outputs(raw_text)
    return outputs
def _empty_outputs(msg: str):
    """Build the 6-tuple of UI outputs for the "nothing to show" state.

    The message is shown in the first two HTML panels; the markdown table is
    blank, the clause dropdown is cleared, the document state is ``None`` and
    the report component is hidden.
    """
    placeholder = f'<div class="cx-empty">{msg}</div>'
    cleared_dropdown = gr.update(choices=[], value=None)
    hidden_report = gr.update(visible=False, value=None)
    return placeholder, placeholder, "", cleared_dropdown, None, hidden_report
def _build_outputs(text: str):
    """Analyse ``text`` and build every UI output for the results view.

    Returns a 6-tuple: summary HTML, top-risk cards HTML, the all-clauses
    markdown table, a dropdown update with one choice per clause, the raw
    document dict (stored as state) and an update that reveals the report
    component. When analysis fails the empty-state outputs are returned.
    """
    from html import escape as _esc  # local import keeps the block self-contained

    def _pct(score) -> int:
        # Round away float noise before truncating: 0.29 * 100 evaluates to
        # 28.999999999999996, which a bare int() would display as 28%.
        return int(round(score * 100, 6))

    doc, err = _run_analysis(text)
    if doc is None:
        return _empty_outputs(err)
    overall_level = doc["overall_level"]
    overall_score = doc["overall_risk"]
    num_clauses = doc["num_clauses"]
    pct = _pct(overall_score)
    high_n = sum(1 for r in doc["clauses"] if r["risk_level_raw"] == "High")
    med_n = sum(1 for r in doc["clauses"] if r["risk_level_raw"] == "Medium")
    low_n = sum(1 for r in doc["clauses"] if r["risk_level_raw"] == "Low")

    # Status notes; error text is escaped because it is arbitrary exception output.
    model_note = ""
    if not manager.is_ready and manager._load_attempted:
        model_note = (f'<div class="cx-note cx-warn">⚠️ Neural model unavailable — '
                      f'<code>{_esc(manager.load_error[:100])}</code>. Symbolic only.</div>')
    if nl_summarizer is None or not nl_summarizer.enabled:
        gem_status = nl_summarizer.last_error if nl_summarizer else "module missing"
        model_note += (f'<div class="cx-note">ℹ️ Gemini summaries disabled '
                       f'({_esc(str(gem_status))}). Template summaries will be used.</div>')

    gauge = _risk_gauge_html(pct, overall_level)
    summary_html = f"""
<div class="cx-summary-grid">
<div class="cx-card cx-gauge-card">{gauge}</div>
<div class="cx-card cx-stat-card"><div class="cx-stat-label">Clauses</div>
<div class="cx-stat-val">{num_clauses}</div></div>
<div class="cx-card cx-stat-card"><div class="cx-stat-label">🔴 High</div>
<div class="cx-stat-val" style="color:#ef4444">{high_n}</div></div>
<div class="cx-card cx-stat-card"><div class="cx-stat-label">🟡 Medium</div>
<div class="cx-stat-val" style="color:#f59e0b">{med_n}</div></div>
<div class="cx-card cx-stat-card"><div class="cx-stat-label">🟢 Low</div>
<div class="cx-stat-val" style="color:#10b981">{low_n}</div></div>
</div>
{model_note}"""

    # ── Top-risk cards ───────────────────────────────────────────────────────
    top_parts = ['<div class="cx-section-title">🔥 Top Risk Clauses</div>',
                 '<div class="cx-top-grid">']
    for r in doc["top_risks"]:
        lvl = r["risk_level_raw"]
        color = LEVEL_HEX.get(lvl, "#6b7280")
        cpct = _pct(r["risk_score"])
        bar = _mini_bar(cpct, lvl)
        preview = highlight_keywords(r["clause_text"][:220].replace("\n", " "))
        conf_html = _confidence_badge(r.get("confidence"))
        clause_no = r.get("clause_number")
        no_str = f' · {clause_no}' if clause_no else ''
        pills = "".join(
            f'<span class="cx-pill" style="border-color:{color}33;color:{color}">'
            f'{CAT_ICON.get(rule["category"],"⚠️")} {rule["rule_id"]}</span>'
            for rule in r["triggered_rules"][:3]
        ) or '<span class="cx-pill-none">No violations</span>'
        top_parts.append(f"""
<div class="cx-clause-card" style="border-left:3px solid {color}">
<div class="cx-clause-header">
<span class="cx-clause-num">#{r['clause_index']}{no_str}</span>
<span class="cx-badge" style="background:{color}22;color:{color}">{lvl}</span>
{conf_html}
<span class="cx-score-label">{cpct}%</span>
</div>
<div class="cx-bar-wrap">{bar}</div>
<div class="cx-clause-preview">{preview}{'…' if len(r['clause_text']) > 220 else ''}</div>
<div class="cx-pills">{pills}</div>
</div>""")
    top_parts.append("</div>")
    top_html = "\n".join(top_parts)

    # ── All-clauses markdown table ───────────────────────────────────────────
    rows = [
        "## 📄 All Clauses\n",
        "| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
        "|---|--------|-------|-------|------------|----------|---------|",
    ]
    for r in doc["clauses"]:
        # A literal "|" would end the table cell, so emit it as an entity.
        # (The previous replace("|", "|") was a no-op.)
        preview = r["clause_text"][:55].replace("\n", " ").replace("|", "&#124;")
        conf = r.get("confidence") or {}
        clvl = conf.get("level", "—")
        cscore = _pct(conf.get("score") or 0)
        marker = r.get("clause_number") or "—"
        rows.append(
            f"| {r['clause_index']} | {marker} | "
            f"{LEVEL_COLOR.get(r['risk_level_raw'],'⚪')} {r['risk_level_raw']} | "
            f"`{_pct(r['risk_score'])}%` | `{clvl} {cscore}%` | "
            f"`{r['symbolic_score']}` | {preview}… |"
        )
    breakdown_md = "\n".join(rows)

    # ── Dropdown choices ─────────────────────────────────────────────────────
    # The label must start with "#<index>": show_clause_explanation parses it.
    # The clause marker is formatted, not concatenated, so a non-str marker
    # cannot raise TypeError.
    clause_choices = [
        f"#{r['clause_index']}"
        f"{(' ' + str(r['clause_number'])) if r.get('clause_number') else ''}"
        f" | {LEVEL_COLOR.get(r['risk_level_raw'],'⚪')} "
        f"{r['risk_level_raw']} {_pct(r['risk_score'])}% | "
        f"{r['clause_text'][:55].replace(chr(10), ' ')}…"
        for r in doc["clauses"]
    ]
    pdf_update = gr.update(visible=True, value=None)
    return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
def show_clause_explanation(choice: str, doc_state: dict):
    """Gradio callback: render the full explanation panel for one clause.

    ``choice`` is a dropdown label that starts with ``#<1-based index>``;
    ``doc_state`` is the document dict produced by the analysis. Returns an
    HTML string. A missing selection or state yields a hint; a label that
    cannot be resolved to a clause yields "Could not load clause.".

    Side effect: when the summarizer succeeds, the summary is stored on the
    clause record as ``nl_summary``.
    """
    from html import escape as _esc  # local import keeps the block self-contained

    if not choice or not doc_state:
        return '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
    try:
        idx = int(choice.split("|")[0].split()[0].strip().lstrip("#")) - 1
        if idx < 0:
            # "#0" (or a negative number) would otherwise index from the end
            # of the list and silently show the wrong clause.
            raise IndexError(idx)
        r = doc_state["clauses"][idx]
    except (ValueError, IndexError, KeyError, TypeError, AttributeError):
        return '<div class="cx-empty">Could not load clause.</div>'

    explanation = generate_explanation(r["clause_text"], r)

    # Natural-language summary: best effort, like the LIME/attention paths below.
    nl_text = ""
    if nl_summarizer is not None:
        try:
            nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"]) or ""
        except Exception as e:
            print(f"[WARN] NL summary failed: {e}")
        else:
            r["nl_summary"] = nl_text
    explanation["natural_language_summary"] = nl_text

    lime_words = []
    if local_explainer is not None and build_predict_fn_for_manager is not None:
        try:
            manager.ensure_loaded()
            predict_fn = build_predict_fn_for_manager(manager)
            lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn)
        except Exception as e:
            print(f"[WARN] LIME path failed: {e}")
    attn_tokens = []
    if local_explainer is not None and manager.is_ready:
        try:
            attn_tokens = local_explainer.get_attention_map(
                r["clause_text"], manager.model, manager.tokenizer,
            )
        except Exception as e:
            print(f"[WARN] Attention path failed: {e}")

    lvl = r["risk_level_raw"]
    color = LEVEL_HEX.get(lvl, "#6b7280")
    # Round away float noise before truncating (0.29 * 100 == 28.999999999999996).
    cpct = int(round(r["risk_score"] * 100, 6))
    bar = _mini_bar(cpct, lvl)
    highlighted = highlight_keywords(r["clause_text"])
    sym_note = ('<div class="cx-note">ℹ️ Neural model not loaded — symbolic score only.</div>'
                if not manager.is_ready else "")
    breakdown_html = _score_breakdown_html(r.get("score_breakdown"))
    conf_badge = _confidence_badge(r.get("confidence"))
    evidence_pills = _evidence_pills(r.get("evidence", {}))
    evidence_block = (
        f'<div class="cx-section-label">🔍 Evidence Detected</div>'
        f'<div class="cx-pills">{evidence_pills}</div>'
    ) if evidence_pills else ""

    nl_block = ""
    nl_text = (explanation.get("natural_language_summary") or "").strip()
    if nl_text:
        gem_tag = ("🤖 Gemini" if (nl_summarizer and nl_summarizer.enabled)
                   else "📝 Template")
        nl_block = (
            f'<div class="cx-section-label">{gem_tag} AI Summary</div>'
            f'<div class="cx-nl">{nl_text}</div>'
        )
    lime_block = ""
    if lime_words:
        lime_block = (f'<div class="cx-section-label">🧪 LIME — Key Legal Terms Driving Risk</div>'
                      f'{lime_html(lime_words)}')
    attn_block = ""
    if attn_tokens:
        attn_block = (f'<div class="cx-section-label">👁️ Attention Heatmap</div>'
                      f'{attention_heatmap_html(attn_tokens)}')
    bd_text_block = ""
    if explanation.get("score_breakdown_text"):
        bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'

    # One card per triggered rule; the parts are joined once at the end.
    rule_cards = []
    for rule_data in explanation.get("rules") or []:
        rid = rule_data["rule_id"]
        icon = CAT_ICON.get(rule_data.get("category", ""), "⚠️")
        ev_html = ""
        if rule_data.get("evidence"):
            # Matched phrases are substrings of the analysed document, so they
            # are escaped before being embedded in markup.
            matched = "".join(
                f'<span class="cx-ev-pill-sm">“{_esc(str(e.get("phrase")))}”</span>'
                for e in rule_data["evidence"]
                if e.get("phrase")
            )
            ev_html = ('<div class="cx-rule-row"><span class="cx-rule-k">Matched</span><span>'
                       + matched + '</span></div>')
        rule_cards.append(f"""
<div class="cx-rule-card" style="border-left:2px solid {color}">
<div class="cx-rule-header">
<span>{icon} <strong>[{rid}]</strong> {rule_data['name']}</span>
<span class="cx-ref">{rule_data['reference']}</span>
</div>
<div class="cx-rule-row"><span class="cx-rule-k">Why flagged</span>
<span>{rule_data.get('why','—')}</span></div>
<div class="cx-rule-row"><span class="cx-rule-k">What it means</span>
<span>{rule_data.get('meaning','—')}</span></div>
{ev_html}
<div class="cx-rule-row cx-suggestion"><span class="cx-rule-k">💡 Fix</span>
<span>{rule_data.get('suggestion','—')}</span></div>
</div>""")
    rules_html = "".join(rule_cards)
    if not rules_html:
        rules_html = '<div class="cx-empty">No specific rule violations detected.</div>'

    overview = explanation.get("overview", "")
    general_tip = explanation.get("general_tip", "")
    tip_block = (f'<div class="cx-section-label">💡 General Guidance</div>'
                 f'<div class="cx-tip">{general_tip}</div>') if general_tip else ""
    clause_no = r.get("clause_number")
    title_note = f" · {clause_no}" if clause_no else ""
    return f"""
<div class="cx-exp-wrap">
<div class="cx-exp-header" style="border-left:4px solid {color}">
<div>
<div class="cx-exp-title">Clause #{r['clause_index']}{title_note}</div>
<div class="cx-badges-row">
<span class="cx-badge" style="background:{color}22;color:{color}">{lvl} RISK</span>
{conf_badge}
<span class="cx-badge cx-badge-sm">Fused {cpct}%</span>
<span class="cx-badge cx-badge-sm">Neural {r['neural_score']}</span>
<span class="cx-badge cx-badge-sm">Symbolic {r['symbolic_score']}</span>
</div>
</div>
<div style="width:180px;padding-top:8px">{bar}</div>
</div>
{sym_note}
{nl_block}
<div class="cx-section-label">📋 Overview</div>
<div class="cx-overview">{overview}</div>
<div class="cx-section-label">🧮 Score Breakdown</div>
{breakdown_html}
{bd_text_block}
{evidence_block}
{lime_block}
{attn_block}
<div class="cx-section-label">🔦 Clause Text</div>
<div class="cx-clause-text">{highlighted}</div>
<div class="cx-section-label">⚖️ Rule Analysis</div>
{rules_html}
{tip_block}
</div>"""
def build_pdf_report(doc_state: dict):
    """Gradio callback: generate the PDF report for the analysed document.

    Returns a component update: hidden when there is no document, visible
    with no file when the report module is unavailable or generation fails,
    and visible with the value returned by ``generate_report`` otherwise.
    """
    if not doc_state:
        return gr.update(visible=False, value=None)
    if generate_report is None:
        return gr.update(visible=True, value=None)
    tmp_path = None
    try:
        # delete=False: the file has to outlive this call so it can be served.
        with tempfile.NamedTemporaryFile(prefix="clausexplain_", suffix=".pdf",
                                         delete=False) as tmp:
            tmp_path = tmp.name
        out = generate_report(doc_state, tmp_path)
        return gr.update(visible=True, value=out)
    except Exception as e:
        print(f"[ERROR] PDF report generation failed: {e}")
        # Nothing will ever reference the temp file after a failure, so
        # remove it instead of leaking one file per failed attempt.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return gr.update(visible=True, value=None)
# Sample inputs as (label, contract text) pairs. The labels name the outcome
# each sample is meant to illustrate.
# NOTE(review): the code that consumes this list is outside the visible part
# of the file — presumably the example buttons of the UI; confirm.
EXAMPLES = [
    ("⚡ High Risk", """1. Liability Cap
The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
including gross negligence or wilful misconduct of either party.
2. Non-Compete
Employee shall not compete with the company in any capacity for 3 years following
termination of this agreement, within the territory of India.
3. Indemnity
The Service Provider shall indemnify and hold harmless the Client against any and all
claims, damages, losses, and expenses arising out of or related to this agreement."""),
    ("🟡 Medium Risk", """1. Auto-Renewal
This agreement shall automatically renew for successive one-year terms.
2. Arbitration
Any dispute arising out of this agreement shall be referred to arbitration with
the seat of arbitration in Singapore.
3. Pricing
The Company may modify the prices and fees charged under this agreement at
its sole discretion to modify the terms upon written notice."""),
    ("🟢 Low Risk", """1. Renewal
This agreement renews automatically every year unless either party provides
30 days written notice before the renewal date.
2. Governing Law
This agreement is governed by the laws of India."""),
    # Ordinary M&A wording ("exclusive of", "contingent on ... closing") with
    # no clause that is risky on its face.
    ("🧪 Benign (M&A-style)", """Compensation paid hereunder shall be exclusive of the Company's
contributions to statutory benefits. Payment of the closing bonus is
contingent on the occurrence of the closing of the merger transaction
and continued employment through such date."""),
]
# Global stylesheet handed to the Gradio app via gr.Blocks(css=CUSTOM_CSS).
# It imports the DM Sans / DM Mono web fonts, applies a dark palette to the
# Gradio container and its form controls, hides the footer, caps the page at
# 1080px, and defines the `cx-*` classes referenced by the dashboard's HTML
# fragments (hero, model notice, dividers, empty states in build_ui; the
# remaining classes are presumably emitted by the result renderers elsewhere
# in this file — verify against those helpers before removing any rule).
# NOTE: everything between the triple quotes is a runtime string, so notes
# about individual rules must live out here rather than inside the literal.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
* { box-sizing: border-box; }
body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
footer { display:none !important; }
.gradio-container { max-width:1080px !important; margin:0 auto !important; }
.cx-hero { text-align:center; padding:52px 24px 36px; background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%); border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
.cx-hero::before { content:''; position:absolute; inset:0; background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%); pointer-events:none; }
.cx-hero-icon { font-size:44px; margin-bottom:14px; }
.cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em; background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 10px; line-height:1.1; }
.cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
.cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
.cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase; padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
.cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px; padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
.cx-model-notice strong { color:#a5b4fc; }
.cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
.cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
@media(max-width:720px){ .cx-summary-grid { grid-template-columns:1fr 1fr; } }
.cx-gauge-card { display:flex; align-items:center; justify-content:center; padding:20px; }
.cx-stat-card { display:flex; flex-direction:column; justify-content:center; gap:6px; }
.cx-stat-label { font-size:11px; font-weight:600; letter-spacing:.06em; text-transform:uppercase; color:#4b5563; }
.cx-stat-val { font-size:30px; font-weight:700; font-family:'DM Mono',monospace; line-height:1; }
.cx-note { background:#1e293b; border-radius:8px; padding:10px 14px; font-size:13px; color:#94a3b8; margin:6px 0; }
.cx-warn { border-left:3px solid #f59e0b; color:#fcd34d !important; }
.cx-section-title, .cx-section-label { font-size:11px; font-weight:700; letter-spacing:.08em; text-transform:uppercase; color:#4b5563; margin:20px 0 10px; }
.cx-top-grid { display:flex; flex-direction:column; gap:12px; }
.cx-clause-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 18px; transition:border-color .15s; }
.cx-clause-card:hover { border-color:#2d3748; }
.cx-clause-header { display:flex; align-items:center; gap:8px; margin-bottom:10px; flex-wrap:wrap; }
.cx-clause-num { font-family:'DM Mono',monospace; font-size:12px; color:#4b5563; min-width:28px; }
.cx-badge { font-size:10px; font-weight:700; letter-spacing:.07em; text-transform:uppercase; padding:3px 9px; border-radius:20px; }
.cx-badge-sm { background:#1e293b !important; color:#64748b !important; }
.cx-score-label { font-family:'DM Mono',monospace; font-size:14px; font-weight:600; color:#e2e8f0; margin-left:auto; }
.cx-bar-wrap { margin-bottom:12px; }
.cx-clause-preview { font-size:13px; color:#94a3b8; line-height:1.65; margin-bottom:12px; }
.cx-clause-preview strong { color:#fca5a5; background:#7f1d1d28; border-radius:3px; padding:0 2px; }
.cx-pills { display:flex; flex-wrap:wrap; gap:6px; }
.cx-pill { font-size:11px; font-weight:500; padding:3px 8px; border-radius:6px; border:1px solid; letter-spacing:.02em; }
.cx-pill-none { font-size:12px; color:#374151; }
.cx-ev-pill { font-size:11px; padding:3px 8px; border-radius:6px; background:#1e293b; color:#a5b4fc; border:1px solid #312e81; font-family:'DM Mono',monospace; }
.cx-ev-pill-sm { font-size:11px; padding:2px 7px; border-radius:5px; background:#1e293b; color:#a5b4fc; border:1px solid #312e81; font-family:'DM Mono',monospace; margin-right:4px; display:inline-block; }
.cx-breakdown { background:#0c1525; border:1px solid #1e293b; border-radius:10px; padding:14px 16px; font-family:'DM Mono',monospace; font-size:13px; color:#cbd5e1; }
.cx-bd-row { display:grid; grid-template-columns:80px 80px 1fr; align-items:center; padding:4px 0; }
.cx-bd-k { color:#6b7280; font-size:11px; text-transform:uppercase; letter-spacing:.06em; }
.cx-bd-v { color:#e2e8f0; font-weight:600; }
.cx-bd-w { color:#94a3b8; }
.cx-bd-sep { border:none; border-top:1px solid #1e293b; margin:8px 0; }
.cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
.cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
.cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
.cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
.cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
.cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
.cx-exp-wrap { display:flex; flex-direction:column; gap:14px; }
.cx-exp-header { background:#111827; border-radius:12px; padding:16px 20px; display:flex; align-items:flex-start; justify-content:space-between; gap:16px; }
.cx-exp-title { font-size:17px; font-weight:700; margin-bottom:8px; }
.cx-badges-row { display:flex; gap:6px; flex-wrap:wrap; }
.cx-overview, .cx-nl { background:#111827; border-radius:10px; padding:14px 16px; font-size:14px; color:#cbd5e1; line-height:1.75; }
.cx-nl { border-left:3px solid #8b5cf6; }
.cx-clause-text { background:#0c1525; border:1px solid #1e293b; border-radius:10px; padding:16px; font-size:13px; line-height:1.9; color:#94a3b8; font-family:'DM Mono',monospace; white-space:pre-wrap; }
.cx-clause-text strong { color:#fca5a5; background:#7f1d1d2a; border-radius:3px; padding:1px 3px; }
.cx-rule-card { background:#111827; border-radius:10px; padding:14px 16px; display:flex; flex-direction:column; gap:8px; margin-bottom:8px; }
.cx-rule-header { display:flex; align-items:flex-start; justify-content:space-between; gap:12px; font-size:14px; font-weight:600; }
.cx-ref { font-size:11px; color:#6366f1; font-family:'DM Mono',monospace; white-space:nowrap; padding-top:2px; flex-shrink:0; }
.cx-rule-row { display:grid; grid-template-columns:100px 1fr; gap:8px; font-size:13px; color:#94a3b8; align-items:baseline; }
.cx-rule-k { font-size:10px; font-weight:700; letter-spacing:.05em; text-transform:uppercase; color:#374151; }
.cx-suggestion span:last-child { color:#6ee7b7; }
.cx-tip { background:#0a1f16; border:1px solid #064e3b40; border-radius:10px; padding:14px 16px; font-size:13px; color:#6ee7b7; line-height:1.75; }
.cx-attn-wrap, .cx-lime-wrap { background:#0c1525; border:1px solid #1e293b; border-radius:10px; padding:14px 16px; }
.cx-attn-title, .cx-lime-title { font-size:11px; font-weight:700; letter-spacing:.07em; text-transform:uppercase; color:#6b7280; margin-bottom:10px; }
.cx-attn-grid { display:flex; flex-wrap:wrap; gap:4px; }
.cx-attn-chip { display:inline-flex; align-items:center; gap:4px; padding:3px 7px; border-radius:5px; font-family:'DM Mono',monospace; font-size:12px; color:#e2e8f0; }
.cx-attn-w { font-size:9px; opacity:.6; }
.cx-attn-legend, .cx-lime-legend { font-size:11px; color:#4b5563; margin-top:10px; }
.cx-lime-list { display:flex; flex-direction:column; gap:6px; }
.cx-lime-row { display:grid; grid-template-columns:120px 1fr 80px; align-items:center; gap:10px; }
.cx-lime-word { font-family:'DM Mono',monospace; font-size:12px; color:#cbd5e1; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
.cx-lime-bar-wrap { background:#1e293b; border-radius:4px; height:6px; }
.cx-lime-bar { height:6px; border-radius:4px; }
.cx-lime-w { font-family:'DM Mono',monospace; font-size:11px; text-align:right; }
.gr-button, button { border-radius:8px !important; font-weight:600 !important; }
button.primary { background:linear-gradient(135deg,#6366f1,#8b5cf6) !important; border:none !important; color:#fff !important; letter-spacing:.02em !important; transition:opacity .15s !important; }
button.secondary { background:#1e293b !important; border:1px solid #334155 !important; color:#e2e8f0 !important; }
button:hover { opacity:.88 !important; }
.gr-box, .gr-form { background:#111827 !important; border-color:#1e293b !important; border-radius:12px !important; }
textarea, input[type=text] { background:#0c1525 !important; border:1px solid #1e293b !important; color:#e2e8f0 !important; border-radius:8px !important; font-family:'DM Sans',sans-serif !important; }
label > span { color:#64748b !important; font-size:13px !important; }
.gr-file { background:#0c1525 !important; border:1px dashed #2d3748 !important; border-radius:10px !important; }
select, .gr-dropdown { background:#0c1525 !important; border-color:#1e293b !important; color:#e2e8f0 !important; border-radius:8px !important; }
.gr-accordion > .label-wrap { background:#111827 !important; border-color:#1e293b !important; border-radius:10px !important; color:#94a3b8 !important; }
"""
def build_ui():
    """Assemble the ClauseXplain dashboard and return it without launching.

    The page is laid out top to bottom as: hero banner, model-load notice,
    a two-column input row (PDF upload / pasted text), one button per entry
    in ``EXAMPLES``, the summary and top-risk panels, a collapsed full
    breakdown, the PDF-report download row, the clause explorer, and a
    footer disclaimer.

    Event wiring:
      * both "Analyse" buttons call their handler (``analyze_pdf`` /
        ``analyze_text``) and write to the same six outputs;
      * changing the clause dropdown calls ``show_clause_explanation`` with
        the selected value and the per-session state;
      * the download button calls ``build_pdf_report`` with that state.

    Returns:
        The constructed ``gr.Blocks`` application.
    """
    dashboard_theme = gr.themes.Base(
        primary_hue=gr.themes.colors.indigo,
        neutral_hue=gr.themes.colors.slate,
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
    )

    # Static markup. The string bodies stay flush-left on purpose so the HTML
    # sent to the browser is exactly the text between the quotes.
    hero_html = """
<div class="cx-hero">
<div class="cx-hero-icon">⚖️</div>
<h1 class="cx-hero-title">ClauseXplain</h1>
<p class="cx-hero-sub">International contract neural backbone, localised via Indian neuro-symbolic legal reasoning</p>
<div class="cx-badges">
<span class="cx-badge-hero">ICA 1872</span>
<span class="cx-badge-hero">DPDPA 2023</span>
<span class="cx-badge-hero">IT Act 2000</span>
<span class="cx-badge-hero">CPA 2019</span>
<span class="cx-badge-hero">Arbitration Act 1996</span>
<span class="cx-badge-hero">Gemini · LIME · Attention</span>
</div>
</div>
"""
    model_notice_html = """
<div class="cx-model-notice">
⏳ &nbsp;The neural model (~2 GB) loads on your <strong>first analysis request</strong> —
expect 60–90 s. Per-clause LIME + attention run lazily when you inspect a clause (~15–25 s).
</div>
"""
    # NOTE(review): the footer text colour (#1e293b) looks very faint against
    # the page background set in CUSTOM_CSS — confirm the disclaimer is meant
    # to be that subdued.
    footer_html = """
<hr class="cx-divider">
<div style="text-align:center;padding:12px 0 4px;color:#1e293b;font-size:12px;letter-spacing:.04em">
Built for Indian Contract Intelligence &nbsp;·&nbsp;
Neuro-Symbolic AI &nbsp;·&nbsp;
<em>For informational purposes only — not legal advice. Consult a qualified lawyer.</em>
</div>
"""
    upload_heading_html = (
        '<div style="font-size:11px;font-weight:700;letter-spacing:.08em;'
        'text-transform:uppercase;color:#6366f1;margin-bottom:8px">📂 Upload PDF</div>'
    )
    paste_heading_html = (
        '<div style="font-size:11px;font-weight:700;letter-spacing:.08em;'
        'text-transform:uppercase;color:#6366f1;margin-bottom:8px">✏️ Paste Text</div>'
    )
    examples_heading_html = (
        '<div style="font-size:11px;font-weight:600;letter-spacing:.07em;'
        'text-transform:uppercase;color:#374151;margin:16px 0 8px">Try an example</div>'
    )
    explorer_heading_html = (
        '<div style="font-size:11px;font-weight:700;letter-spacing:.08em;'
        'text-transform:uppercase;color:#4b5563;margin-bottom:12px">🔎 Clause Explorer</div>'
    )
    divider_html = '<hr class="cx-divider">'

    with gr.Blocks(
        title="ClauseXplain — AI Legal Risk Dashboard",
        theme=dashboard_theme,
        css=CUSTOM_CSS,
    ) as app:
        # Per-session slot for the latest analysis result; starts empty.
        analysis_state = gr.State(value=None)

        gr.HTML(hero_html)
        gr.HTML(model_notice_html)

        # ── Inputs: PDF upload on the left, pasted text on the right ──────────
        with gr.Row(equal_height=True):
            with gr.Column():
                gr.HTML(upload_heading_html)
                pdf_upload = gr.File(label="Contract PDF", file_types=[".pdf"], type="filepath")
                analyse_pdf_btn = gr.Button("Analyse PDF →", variant="primary")
            with gr.Column():
                gr.HTML(paste_heading_html)
                pasted_text = gr.Textbox(
                    label="",
                    placeholder="Paste one or more contract clauses here…",
                    lines=6,
                )
                analyse_text_btn = gr.Button("Analyse Text →", variant="secondary")

        # ── One-click examples ────────────────────────────────────────────────
        gr.HTML(examples_heading_html)
        with gr.Row():
            for example_label, example_body in EXAMPLES:
                example_btn = gr.Button(example_label, size="sm")
                # The default argument freezes this iteration's text; a plain
                # closure would make every button paste the last example.
                example_btn.click(fn=lambda snippet=example_body: snippet, outputs=pasted_text)

        # ── Results ───────────────────────────────────────────────────────────
        gr.HTML(divider_html)
        summary_panel = gr.HTML(
            '<div class="cx-empty">Upload a PDF or paste contract text to begin analysis.</div>'
        )
        gr.HTML(divider_html)
        top_risks_panel = gr.HTML("")
        with gr.Accordion("📄 Full Clause Breakdown", open=False):
            breakdown_panel = gr.Markdown("")
        with gr.Row():
            download_btn = gr.Button("📥 Download PDF Report", variant="primary")
            # Created hidden; it is the only output of the download handler.
            report_file = gr.File(label="Compliance Report", visible=False, interactive=False)

        # ── Clause explorer ───────────────────────────────────────────────────
        gr.HTML(divider_html)
        gr.HTML(explorer_heading_html)
        clause_picker = gr.Dropdown(label="Select a clause to inspect", choices=[], interactive=True)
        explanation_panel = gr.HTML(
            '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
        )

        gr.HTML(footer_html)

        # ── Event wiring ──────────────────────────────────────────────────────
        # Both analysis entry points refresh the same set of components.
        analysis_outputs = [
            summary_panel,
            top_risks_panel,
            breakdown_panel,
            clause_picker,
            analysis_state,
            report_file,
        ]
        analysis_routes = (
            (analyse_pdf_btn, analyze_pdf, pdf_upload),
            (analyse_text_btn, analyze_text, pasted_text),
        )
        for trigger, handler, source in analysis_routes:
            trigger.click(fn=handler, inputs=[source], outputs=analysis_outputs)

        clause_picker.change(
            fn=show_clause_explanation,
            inputs=[clause_picker, analysis_state],
            outputs=[explanation_panel],
        )
        download_btn.click(fn=build_pdf_report, inputs=[analysis_state], outputs=[report_file])

    return app
if __name__ == "__main__":
    # Script entry point: build the dashboard only when this file is run
    # directly, so importing the module does not start a server.
    demo = build_ui()
    # Listen on all interfaces at a fixed port (7860) — presumably what the
    # hosting container expects; confirm before changing either value.
    demo.launch(server_name="0.0.0.0", server_port=7860)