Spaces:

riyasuryawanshi746
/

Major_Project

Sleeping

App Files Community

riyasuryawanshi746 committed 14 days ago

Commit

af910e9

verified ·

1 Parent(s): 92625e7

Explainability and Symbolic part fixed 12th May

Browse files

Files changed (6) hide show

app.py +40 -68
explanation.py +13 -53
feature_extractor.py +125 -107
inference.py +40 -66
local_interpreters.py +86 -48
pdf_utils.py +159 -128

app.py CHANGED Viewed

@@ -1,7 +1,10 @@
 # app.py
-# ClauseXplain v5.2 — full explainability stack
-# Integrates: feature_extractor, nl_summary, local_interpreters,
-# attention_visualization, report
 from __future__ import annotations
 import os
@@ -18,10 +21,10 @@ from transformers import LongformerTokenizer, LongformerModel
 from sklearn.preprocessing import MultiLabelBinarizer
 from huggingface_hub import hf_hub_download
-# ── New modules ───────────────────────────────────────────────────────────────
 from feature_extractor import ClauseFeatureExtractor
 from explanation       import generate_explanation
-from utils             import highlight_keywords  # (others unused)
 # ── Optional / fail-soft integrations ─────────────────────────────────────────
 try:
@@ -52,10 +55,8 @@ except Exception as _e:
     print(f"[WARN] report disabled: {_e}")
     generate_report = None
-# ── Device — always CPU on HF free tier ───────────────────────────────────────
 DEVICE = torch.device("cpu")
-# ── Label sets (unchanged) ────────────────────────────────────────────────────
 CLAUSE_CLASSES = [
     "Cap On Liability", "Change Of Control", "Covenant Not To Sue",
     "Exclusivity", "Governing Law", "IP Ownership Assignment",
@@ -76,7 +77,7 @@ CLAUSE_CLASSES = [
 RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
 # ─────────────────────────────────────────────────────────────────────────────
-# Symbolic rules — original 12 + 5 newly added
 # ─────────────────────────────────────────────────────────────────────────────
 SYMBOLIC_RULES = [
     {"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
@@ -91,21 +92,28 @@ SYMBOLIC_RULES = [
     {"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
      "reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
      "condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
-    {"rule_id": "ICA_005", "name": "Wagering / Contingency Clause",
      "reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
      "condition": lambda f: f.get("is_wagering_clause")},
     {"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
      "reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
      "condition": lambda f: f.get("restrains_legal_proceedings")},
     {"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
      "reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
-     "condition": lambda f: f.get("has_indemnity_clause") and not f.get("indemnity_capped")},
     {"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
      "reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
      "condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
     {"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
      "reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
      "condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
     {"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
      "reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
      "condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
@@ -134,7 +142,7 @@ SYMBOLIC_RULES = [
 # ─────────────────────────────────────────────────────────────────────────────
-# Model definition (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 class ClauseXplainV5(nn.Module):
     def __init__(self, num_clause_labels: int, num_risk_labels: int):
@@ -211,7 +219,6 @@ class ModelManager:
                 repo_id="riyasuryawanshi746/clauseXplain",
                 filename="clausexplain_v5_best.pt",
             )
-            print(f"[INFO] Checkpoint at: {ckpt_path}")
             checkpoint = torch.load(
                 ckpt_path,
                 map_location=torch.device("cpu"),
@@ -301,7 +308,7 @@ class ModelManager:
             "top_risk_cats":   top_risks,
             "triggered_rules": triggered_clean,
             "features":        {k: v for k, v in features.items() if v},
-            "evidence":        evidence,            # nested dict (kept for explanation engine)
             "score_breakdown": fusion["breakdown"],
             "confidence":      confidence,
         }
@@ -333,9 +340,8 @@ class ModelManager:
         scores  = [r["risk_score"] for r in results]
         overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
-        if   overall <= 0.33: level = "Low"
-        elif overall <= 0.66: level = "Medium"
-        else:                 level = "High"
         return {
             "overall_risk":  overall,
             "overall_level": level,
@@ -348,7 +354,7 @@ class ModelManager:
 manager = ModelManager()
 # ═══════════════════════════════════════════════════════════════════════════════
-# UI helpers
 # ═══════════════════════════════════════════════════════════════════════════════
 LEVEL_COLOR = {"Low": "🟢", "Medium": "🟡", "High": "🔴"}
 LEVEL_HEX   = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
@@ -412,8 +418,6 @@ def _score_breakdown_html(breakdown) -> str:
     if not breakdown:
         return ""
     w = breakdown["weights"]
-    floor_note = ('<div class="cx-bd-floor">⚓ Floor 0.30 applied — symbolic rules fired</div>'
-                  if breakdown.get("floor_applied") else "")
     return f"""
 <div class="cx-breakdown">
   <div class="cx-bd-row">
@@ -432,12 +436,11 @@ def _score_breakdown_html(breakdown) -> str:
     <span class="cx-bd-k">Final</span>
     <span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
   </div>
-  {floor_note}
 </div>"""
 # ═══════════════════════════════════════════════════════════════════════════════
-# Analysis flow
 # ═══════════════════════════════════════════════════════════════════════════════
 def _run_analysis(text: str):
     if not text or len(text.strip()) < 30:
@@ -505,7 +508,6 @@ def _build_outputs(text: str):
 </div>
 {model_note}"""
-    # Top cards
     top_parts = ['<div class="cx-section-title">🔥 Top Risk Clauses</div>',
                  '<div class="cx-top-grid">']
     for r in doc["top_risks"]:
@@ -537,7 +539,6 @@ def _build_outputs(text: str):
     top_parts.append("</div>")
     top_html = "\n".join(top_parts)
-    # Markdown breakdown table
     rows = [
         "## 📄 All Clauses\n",
         "| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
@@ -565,14 +566,10 @@ def _build_outputs(text: str):
         f"{r['clause_text'][:55].replace(chr(10), ' ')}…"
         for r in doc["clauses"]
     ]
-    # PDF download button: visible only after a successful analysis
     pdf_update = gr.update(visible=True, value=None)
     return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
-# ─────────────────────────────────────────────────────────────────────────────
-# Clause explanation panel — lazy-runs Gemini + LIME + attention here
-# ─────────────────────────────────────────────────────────────────────────────
 def show_clause_explanation(choice: str, doc_state: dict):
     if not choice or not doc_state:
         return '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
@@ -584,27 +581,22 @@ def show_clause_explanation(choice: str, doc_state: dict):
     explanation = generate_explanation(r["clause_text"], r)
-    # ── Gemini summary (cached) ────────────────────────────────────────────
     if nl_summarizer is not None:
         nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
         explanation["natural_language_summary"] = nl_text
-        # Persist so the PDF report can include it
         r["nl_summary"] = nl_text
     else:
         explanation["natural_language_summary"] = ""
-    # ── LIME (lazy, bounded) ───────────────────────────────────────────────
     lime_words = []
     if local_explainer is not None and build_predict_fn_for_manager is not None:
         try:
             manager.ensure_loaded()
             predict_fn = build_predict_fn_for_manager(manager)
-            lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn,
-                                                            num_features=10)
         except Exception as e:
             print(f"[WARN] LIME path failed: {e}")
-    # ── Attention map (lazy, bounded) ──────────────────────────────────────
     attn_tokens = []
     if local_explainer is not None and manager.is_ready:
         try:
@@ -614,7 +606,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
         except Exception as e:
             print(f"[WARN] Attention path failed: {e}")
-    # ── Render ─────────────────────────────────────────────────────────────
     lvl   = r["risk_level_raw"]
     color = LEVEL_HEX.get(lvl, "#6b7280")
     cpct  = int(r["risk_score"] * 100)
@@ -631,7 +622,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
         f'<div class="cx-pills">{evidence_pills}</div>'
     ) if evidence_pills else ""
-    # NL summary block
     nl_block = ""
     nl_text  = explanation.get("natural_language_summary", "").strip()
     if nl_text:
@@ -642,22 +632,19 @@ def show_clause_explanation(choice: str, doc_state: dict):
             f'<div class="cx-nl">{nl_text}</div>'
         )
-    # LIME + attention blocks
     lime_block = ""
     if lime_words:
-        lime_block = (f'<div class="cx-section-label">🧪 LIME — Token Contributions</div>'
                       f'{lime_html(lime_words)}')
     attn_block = ""
     if attn_tokens:
         attn_block = (f'<div class="cx-section-label">👁️ Attention Heatmap</div>'
                       f'{attention_heatmap_html(attn_tokens)}')
-    # Score-breakdown plaintext (Riya's "Final Score = ..." string)
     bd_text_block = ""
     if explanation.get("score_breakdown_text"):
         bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
-    # Per-rule cards
     rules_html = ""
     for rule_data in explanation.get("rules") or []:
         rid   = rule_data["rule_id"]
@@ -725,9 +712,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
 </div>"""
-# ─────────────────────────────────────────────────────────────────────────────
-# PDF report download
-# ─────────────────────────────────────────────────────────────────────────────
 def build_pdf_report(doc_state: dict):
     if not doc_state:
         return gr.update(visible=False, value=None)
@@ -744,7 +728,6 @@ def build_pdf_report(doc_state: dict):
         return gr.update(visible=True, value=None)
-# ── Examples ──────────────────────────────────────────────────────────────────
 EXAMPLES = [
     ("⚡ High Risk", """1. Liability Cap
 The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
@@ -752,10 +735,7 @@ including gross negligence or wilful misconduct of either party.
 2. Non-Compete
 Employee shall not compete with the company in any capacity for 3 years following
 termination of this agreement, within the territory of India.
-3. Data Processing
-The vendor shall collect and process customer personal data as required to fulfil
-the services described in Schedule A of this agreement.
-4. Indemnity
 The Service Provider shall indemnify and hold harmless the Client against any and all
 claims, damages, losses, and expenses arising out of or related to this agreement."""),
@@ -766,40 +746,35 @@ Any dispute arising out of this agreement shall be referred to arbitration with
 the seat of arbitration in Singapore.
 3. Pricing
 The Company may modify the prices and fees charged under this agreement at
-its sole discretion upon written notice to the Customer."""),
     ("🟢 Low Risk", """1. Renewal
 This agreement renews automatically every year unless either party provides
 30 days written notice before the renewal date.
 2. Governing Law
 This agreement is governed by the laws of India."""),
 ]
-# ── CSS (additive over v5.1) ──────────────────────────────────────────────────
 CUSTOM_CSS = """
 @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
 * { box-sizing: border-box; }
 body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
 footer { display:none !important; }
 .gradio-container { max-width:1080px !important; margin:0 auto !important; }
-.cx-hero { text-align:center; padding:52px 24px 36px;
-  background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%);
-  border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
-.cx-hero::before { content:''; position:absolute; inset:0;
-  background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%);
-  pointer-events:none; }
 .cx-hero-icon { font-size:44px; margin-bottom:14px; }
-.cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em;
-  background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%);
-  -webkit-background-clip:text; -webkit-text-fill-color:transparent;
-  margin:0 0 10px; line-height:1.1; }
 .cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
 .cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
-.cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase;
-  padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
-.cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px;
-  padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
 .cx-model-notice strong { color:#a5b4fc; }
 .cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
 .cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
@@ -836,7 +811,6 @@ footer { display:none !important; }
 .cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
 .cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
 .cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
-.cx-bd-floor { margin-top:8px; font-size:11px; color:#fbbf24; background:#422006; padding:6px 10px; border-radius:6px; font-family:'DM Sans',sans-serif; }
 .cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
 .cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
 .cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
@@ -880,7 +854,6 @@ select, .gr-dropdown { background:#0c1525 !important; border-color:#1e293b !impo
 """
-# ── Build UI ───────────────────────────────────────────────────────────────────
 def build_ui():
     with gr.Blocks(
         title="ClauseXplain — AI Legal Risk Dashboard",
@@ -898,7 +871,7 @@ def build_ui():
 <div class="cx-hero">
   <div class="cx-hero-icon">⚖️</div>
   <h1 class="cx-hero-title">ClauseXplain</h1>
-  <p class="cx-hero-sub">AI Legal Risk Analyzer for Indian Contracts</p>
   <div class="cx-badges">
     <span class="cx-badge-hero">ICA 1872</span>
     <span class="cx-badge-hero">DPDPA 2023</span>
@@ -913,7 +886,7 @@ def build_ui():
         gr.HTML("""
 <div class="cx-model-notice">
   ⏳ &nbsp;The neural model (~2 GB) loads on your <strong>first analysis request</strong> —
-  expect 60–90 s. LIME + attention run lazily when you open a clause (~15–25 s).
 </div>
 """)
@@ -944,7 +917,6 @@ def build_ui():
         with gr.Accordion("📄 Full Clause Breakdown", open=False):
             breakdown_out = gr.Markdown("")
-        # ── PDF download ──────────────────────────────────────────────────────
         with gr.Row():
             pdf_dl_btn = gr.Button("📥 Download PDF Report", variant="primary")
             pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)

 # app.py
+# ClauseXplain v5.3 — hardening pass
+# Changes vs v5.2:
+#  • ICA_007 (uncapped indemnity) now requires has_uncapped_signal — no more
+#    auto-firing on every "indemnify" mention
+#  • analyze_document uses level_from_score() from inference.py (single source
+#    of truth for the new 0.50 / 0.80 risk-level cutoffs)
 from __future__ import annotations
 import os
 from sklearn.preprocessing import MultiLabelBinarizer
 from huggingface_hub import hf_hub_download
 from feature_extractor import ClauseFeatureExtractor
 from explanation       import generate_explanation
+from inference         import level_from_score   # v5.3: single source of truth
+from utils             import highlight_keywords
 # ── Optional / fail-soft integrations ─────────────────────────────────────────
 try:
     print(f"[WARN] report disabled: {_e}")
     generate_report = None
 DEVICE = torch.device("cpu")
 CLAUSE_CLASSES = [
     "Cap On Liability", "Change Of Control", "Covenant Not To Sue",
     "Exclusivity", "Governing Law", "IP Ownership Assignment",
 RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
 # ─────────────────────────────────────────────────────────────────────────────
+# Symbolic rules — v5.3 tightened
 # ─────────────────────────────────────────────────────────────────────────────
 SYMBOLIC_RULES = [
     {"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
     {"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
      "reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
      "condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
+    # ICA_005: only fires on explicit gambling vocab — no more "contingent on closing"
+    {"rule_id": "ICA_005", "name": "Wagering / Gambling Agreement",
      "reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
      "condition": lambda f: f.get("is_wagering_clause")},
     {"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
      "reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
      "condition": lambda f: f.get("restrains_legal_proceedings")},
+    # ICA_007 TIGHTENED: indemnity + explicit uncapped signal + no cap
     {"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
      "reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
+     "condition": lambda f: (
+         f.get("has_indemnity_clause")
+         and f.get("has_uncapped_signal")
+         and not f.get("indemnity_capped")
+     )},
     {"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
      "reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
      "condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
     {"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
      "reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
      "condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
+    # ICA_010 narrowed via tightened has_exclusivity patterns in feature_extractor
     {"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
      "reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
      "condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
 # ─────────────────────────────────────────────────────────────────────────────
+# Model (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 class ClauseXplainV5(nn.Module):
     def __init__(self, num_clause_labels: int, num_risk_labels: int):
                 repo_id="riyasuryawanshi746/clauseXplain",
                 filename="clausexplain_v5_best.pt",
             )
             checkpoint = torch.load(
                 ckpt_path,
                 map_location=torch.device("cpu"),
             "top_risk_cats":   top_risks,
             "triggered_rules": triggered_clean,
             "features":        {k: v for k, v in features.items() if v},
+            "evidence":        evidence,
             "score_breakdown": fusion["breakdown"],
             "confidence":      confidence,
         }
         scores  = [r["risk_score"] for r in results]
         overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
+        # v5.3: single source of truth for thresholds
+        level, _ = level_from_score(overall)
         return {
             "overall_risk":  overall,
             "overall_level": level,
 manager = ModelManager()
 # ═══════════════════════════════════════════════════════════════════════════════
+# UI helpers (unchanged from v5.2)
 # ═══════════════════════════════════════════════════════════════════════════════
 LEVEL_COLOR = {"Low": "🟢", "Medium": "🟡", "High": "🔴"}
 LEVEL_HEX   = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
     if not breakdown:
         return ""
     w = breakdown["weights"]
     return f"""
 <div class="cx-breakdown">
   <div class="cx-bd-row">
     <span class="cx-bd-k">Final</span>
     <span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
   </div>
 </div>"""
 # ═══════════════════════════════════════════════════════════════════════════════
+# Analysis flow (unchanged structurally)
 # ═══════════════════════════════════════════════════════════════════════════════
 def _run_analysis(text: str):
     if not text or len(text.strip()) < 30:
 </div>
 {model_note}"""
     top_parts = ['<div class="cx-section-title">🔥 Top Risk Clauses</div>',
                  '<div class="cx-top-grid">']
     for r in doc["top_risks"]:
     top_parts.append("</div>")
     top_html = "\n".join(top_parts)
     rows = [
         "## 📄 All Clauses\n",
         "| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
         f"{r['clause_text'][:55].replace(chr(10), ' ')}…"
         for r in doc["clauses"]
     ]
     pdf_update = gr.update(visible=True, value=None)
     return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
 def show_clause_explanation(choice: str, doc_state: dict):
     if not choice or not doc_state:
         return '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
     explanation = generate_explanation(r["clause_text"], r)
     if nl_summarizer is not None:
         nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
         explanation["natural_language_summary"] = nl_text
         r["nl_summary"] = nl_text
     else:
         explanation["natural_language_summary"] = ""
     lime_words = []
     if local_explainer is not None and build_predict_fn_for_manager is not None:
         try:
             manager.ensure_loaded()
             predict_fn = build_predict_fn_for_manager(manager)
+            lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn)
         except Exception as e:
             print(f"[WARN] LIME path failed: {e}")
     attn_tokens = []
     if local_explainer is not None and manager.is_ready:
         try:
         except Exception as e:
             print(f"[WARN] Attention path failed: {e}")
     lvl   = r["risk_level_raw"]
     color = LEVEL_HEX.get(lvl, "#6b7280")
     cpct  = int(r["risk_score"] * 100)
         f'<div class="cx-pills">{evidence_pills}</div>'
     ) if evidence_pills else ""
     nl_block = ""
     nl_text  = explanation.get("natural_language_summary", "").strip()
     if nl_text:
             f'<div class="cx-nl">{nl_text}</div>'
         )
     lime_block = ""
     if lime_words:
+        lime_block = (f'<div class="cx-section-label">🧪 LIME — Key Legal Terms Driving Risk</div>'
                       f'{lime_html(lime_words)}')
     attn_block = ""
     if attn_tokens:
         attn_block = (f'<div class="cx-section-label">👁️ Attention Heatmap</div>'
                       f'{attention_heatmap_html(attn_tokens)}')
     bd_text_block = ""
     if explanation.get("score_breakdown_text"):
         bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
     rules_html = ""
     for rule_data in explanation.get("rules") or []:
         rid   = rule_data["rule_id"]
 </div>"""
 def build_pdf_report(doc_state: dict):
     if not doc_state:
         return gr.update(visible=False, value=None)
         return gr.update(visible=True, value=None)
 EXAMPLES = [
     ("⚡ High Risk", """1. Liability Cap
 The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
 2. Non-Compete
 Employee shall not compete with the company in any capacity for 3 years following
 termination of this agreement, within the territory of India.
+3. Indemnity
 The Service Provider shall indemnify and hold harmless the Client against any and all
 claims, damages, losses, and expenses arising out of or related to this agreement."""),
 the seat of arbitration in Singapore.
 3. Pricing
 The Company may modify the prices and fees charged under this agreement at
+its sole discretion to modify the terms upon written notice."""),
     ("🟢 Low Risk", """1. Renewal
 This agreement renews automatically every year unless either party provides
 30 days written notice before the renewal date.
 2. Governing Law
 This agreement is governed by the laws of India."""),
+    ("🧪 Benign (M&A-style)", """Compensation paid hereunder shall be exclusive of the Company's
+contributions to statutory benefits. Payment of the closing bonus is
+contingent on the occurrence of the closing of the merger transaction
+and continued employment through such date."""),
 ]
 CUSTOM_CSS = """
 @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
 * { box-sizing: border-box; }
 body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
 footer { display:none !important; }
 .gradio-container { max-width:1080px !important; margin:0 auto !important; }
+.cx-hero { text-align:center; padding:52px 24px 36px; background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%); border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
+.cx-hero::before { content:''; position:absolute; inset:0; background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%); pointer-events:none; }
 .cx-hero-icon { font-size:44px; margin-bottom:14px; }
+.cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em; background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 10px; line-height:1.1; }
 .cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
 .cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
+.cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase; padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
+.cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px; padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
 .cx-model-notice strong { color:#a5b4fc; }
 .cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
 .cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
 .cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
 .cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
 .cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
 .cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
 .cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
 .cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
 """
 def build_ui():
     with gr.Blocks(
         title="ClauseXplain — AI Legal Risk Dashboard",
 <div class="cx-hero">
   <div class="cx-hero-icon">⚖️</div>
   <h1 class="cx-hero-title">ClauseXplain</h1>
+  <p class="cx-hero-sub">International contract neural backbone, localised via Indian neuro-symbolic legal reasoning</p>
   <div class="cx-badges">
     <span class="cx-badge-hero">ICA 1872</span>
     <span class="cx-badge-hero">DPDPA 2023</span>
         gr.HTML("""
 <div class="cx-model-notice">
   ⏳ &nbsp;The neural model (~2 GB) loads on your <strong>first analysis request</strong> —
+  expect 60–90 s. Per-clause LIME + attention run lazily when you inspect a clause (~15–25 s).
 </div>
 """)
         with gr.Accordion("📄 Full Clause Breakdown", open=False):
             breakdown_out = gr.Markdown("")
         with gr.Row():
             pdf_dl_btn = gr.Button("📥 Download PDF Report", variant="primary")
             pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)

explanation.py CHANGED Viewed

@@ -1,15 +1,10 @@
 # explanation.py
-# Template-based explanation engine — adds risk_breakdown, evidence list,
-# confidence_level, natural_language_summary placeholder, and a formatted
-# score_breakdown_text.
 from __future__ import annotations
 from inference import RULE_FEATURE_DEPS
-# ─────────────────────────────────────────────────────────────────────────────
-# Per-rule explanations (kept identical to v5.1)
-# ─────────────────────────────────────────────────────────────────────────────
 RULE_EXPLANATIONS = {
     "ICA_001": {
         "why":        "Liability is capped even for gross negligence or wilful misconduct.",
@@ -32,9 +27,9 @@ RULE_EXPLANATIONS = {
         "suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
     },
     "ICA_005": {
-        "why":        "The clause is contingent on an uncertain event in a way that resembles a wager.",
         "meaning":    "Such agreements are void under Indian Contract Act S.30.",
-        "suggestion": "Remove or restructure the contingency to avoid wagering characteristics.",
     },
     "ICA_006": {
         "why":        "The clause restricts a party from pursuing legal proceedings.",
@@ -42,8 +37,8 @@ RULE_EXPLANATIONS = {
         "suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
     },
     "ICA_007": {
-        "why":        "An indemnity obligation is created without a cap on its monetary exposure.",
-        "meaning":    "You could face unlimited downstream financial liability for third-party claims.",
         "suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
     },
     "ICA_008": {
@@ -57,7 +52,7 @@ RULE_EXPLANATIONS = {
         "suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
     },
     "ICA_010": {
-        "why":        "Exclusivity is imposed without a defined term, making it open-ended.",
         "meaning":    "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
         "suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
     },
@@ -98,9 +93,6 @@ RULE_EXPLANATIONS = {
     },
 }
-# ─────────────────────────────────────────────────────────────────────────────
-# Risk-level prose
-# ─────────────────────────────────────────────────────────────────────────────
 RISK_CONTEXT = {
     "Low":    "This clause appears relatively standard with minimal legal exposure.",
     "Medium": "This clause contains terms that warrant careful review before signing.",
@@ -117,9 +109,6 @@ CATEGORY_CONTEXT = {
 }
-# ─────────────────────────────────────────────────────────────────────────────
-# Helpers
-# ─────────────────────────────────────────────────────────────────────────────
 def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
     deps = RULE_FEATURE_DEPS.get(rule_id, [])
     snippets: list[dict] = []
@@ -134,7 +123,6 @@ def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
 def _flat_evidence(evidence: dict) -> list[dict]:
-    """Riya's Prompt 1 evidence shape."""
     out = []
     for feat, hits in (evidence or {}).items():
         for h in hits:
@@ -154,36 +142,14 @@ def _format_score_breakdown_text(breakdown: dict | None, fused: float) -> str:
     nrm = breakdown.get("neural_score", 0.0)
     sym = breakdown.get("symbolic_score", 0.0)
     fin = breakdown.get("final", fused)
-    note = " [floor 0.30 applied]" if breakdown.get("floor_applied") else ""
     return (
         f"Final Score = {fin:.2f} "
-        f"(Neural {nrm:.2f} x {w.get('neural', 0):.2f} + "
-        f"Symbolic {sym:.2f} x {w.get('symbolic', 0):.2f}){note}"
     )
-# ─────────────────────────────────────────────────────────────────────────────
-# Main entry point
-# ─────────────────────────────────────────────────────────────────────────────
 def generate_explanation(text: str, result: dict) -> dict:
-    """
-    Returns:
-        {
-            # ── Original keys (backward compatible) ──
-            "overview":              str,
-            "rules":                 list[dict],   # with per-rule evidence
-            "general_tip":           str,
-            "score_breakdown":       dict | None,
-            "confidence":            dict | None,
-            # ── New keys (Prompt 1) ──
-            "risk_breakdown":        dict   # neural/symbolic/weights/final
-            "evidence":              list[dict],  # flat list across all features
-            "confidence_level":      str    # "Low"|"Medium"|"High"
-            "natural_language_summary": str  # filled by NLSummarizer (placeholder here)
-            "score_breakdown_text":  str   # human-readable formula string
-        }
-    """
     level_raw   = result.get("risk_level_raw", "Low")
     triggered   = result.get("triggered_rules", [])
     top_cats    = result.get("top_risk_cats", [])
@@ -192,7 +158,6 @@ def generate_explanation(text: str, result: dict) -> dict:
     breakdown   = result.get("score_breakdown")
     confidence  = result.get("confidence") or {}
-    # ── Overview sentence ──────────────────────────────────────────────────
     primary_cat = top_cats[0][0] if top_cats else "structural"
     cat_desc    = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
     overview    = (
@@ -200,7 +165,6 @@ def generate_explanation(text: str, result: dict) -> dict:
         f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
     )
-    # ── Per-rule explanations ──────────────────────────────────────────────
     rule_details = []
     for rule in triggered:
         rid  = rule.get("rule_id", "")
@@ -217,7 +181,6 @@ def generate_explanation(text: str, result: dict) -> dict:
             "evidence":  _evidence_for_rule(rid, evidence),
         })
-    # ── General tip ────────────────────────────────────────────────────────
     general_tip = ""
     if not triggered:
         if level_raw == "Low":
@@ -229,31 +192,28 @@ def generate_explanation(text: str, result: dict) -> dict:
             general_tip = ("High neural risk score despite no specific rule triggers. "
                            "The clause may contain broad or one-sided language - seek legal review.")
-    # ── Risk breakdown dict (Prompt 1) ─────────────────────────────────────
     risk_breakdown = breakdown or {
         "neural_score":   result.get("neural_score", 0.0),
         "symbolic_score": result.get("symbolic_score", 0.0),
-        "weights":        {"neural": 0.60, "symbolic": 0.40},
         "raw_fused":      risk_score,
         "floor_applied":  False,
         "final":          risk_score,
-        "formula":        f"({0.60} × {result.get('neural_score', 0):.3f}) + "
-                          f"({0.40} × {result.get('symbolic_score', 0):.3f}) "
                           f"= {risk_score:.3f}",
     }
     return {
-        # Original / backward-compatible
         "overview":                  overview,
         "rules":                     rule_details,
         "general_tip":               general_tip,
         "score_breakdown":           breakdown,
         "confidence":                confidence,
-        # Prompt 1 additions
         "risk_breakdown":            risk_breakdown,
         "evidence":                  _flat_evidence(evidence),
         "confidence_level":          confidence.get("level", "Medium"),
-        "natural_language_summary":  "",   # filled later by NLSummarizer
         "score_breakdown_text":      _format_score_breakdown_text(breakdown, risk_score),
     }

 # explanation.py
+# v5.3 — fallback risk_breakdown weights aligned to the new fusion config (neural 0.75 / symbolic 0.25).
 from __future__ import annotations
 from inference import RULE_FEATURE_DEPS
 RULE_EXPLANATIONS = {
     "ICA_001": {
         "why":        "Liability is capped even for gross negligence or wilful misconduct.",
         "suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
     },
     "ICA_005": {
+        "why":        "The clause uses gambling, wagering, or betting vocabulary.",
         "meaning":    "Such agreements are void under Indian Contract Act S.30.",
+        "suggestion": "Remove or restructure the wagering element of this clause.",
     },
     "ICA_006": {
         "why":        "The clause restricts a party from pursuing legal proceedings.",
         "suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
     },
     "ICA_007": {
+        "why":        "An indemnity obligation is paired with uncapped / unlimited liability language.",
+        "meaning":    "You could face open-ended financial exposure for third-party claims.",
         "suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
     },
     "ICA_008": {
         "suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
     },
     "ICA_010": {
+        "why":        "Exclusivity rights are granted without a defined term, making them open-ended.",
         "meaning":    "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
         "suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
     },
     },
 }
 RISK_CONTEXT = {
     "Low":    "This clause appears relatively standard with minimal legal exposure.",
     "Medium": "This clause contains terms that warrant careful review before signing.",
 }
 def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
     deps = RULE_FEATURE_DEPS.get(rule_id, [])
     snippets: list[dict] = []
 def _flat_evidence(evidence: dict) -> list[dict]:
     out = []
     for feat, hits in (evidence or {}).items():
         for h in hits:
     nrm = breakdown.get("neural_score", 0.0)
     sym = breakdown.get("symbolic_score", 0.0)
     fin = breakdown.get("final", fused)
     return (
         f"Final Score = {fin:.2f} "
+        f"(Neural {nrm:.2f} × {w.get('neural', 0):.2f} + "
+        f"Symbolic {sym:.2f} × {w.get('symbolic', 0):.2f})"
     )
 def generate_explanation(text: str, result: dict) -> dict:
     level_raw   = result.get("risk_level_raw", "Low")
     triggered   = result.get("triggered_rules", [])
     top_cats    = result.get("top_risk_cats", [])
     breakdown   = result.get("score_breakdown")
     confidence  = result.get("confidence") or {}
     primary_cat = top_cats[0][0] if top_cats else "structural"
     cat_desc    = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
     overview    = (
         f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
     )
     rule_details = []
     for rule in triggered:
         rid  = rule.get("rule_id", "")
             "evidence":  _evidence_for_rule(rid, evidence),
         })
     general_tip = ""
     if not triggered:
         if level_raw == "Low":
             general_tip = ("High neural risk score despite no specific rule triggers. "
                            "The clause may contain broad or one-sided language - seek legal review.")
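+    # v5.3: this fallback is used only when `result` carries no "score_breakdown";
+    # its weights follow the neural-dominant fusion config (neural 0.75 / symbolic 0.25).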
     risk_breakdown = breakdown or {
         "neural_score":   result.get("neural_score", 0.0),
         "symbolic_score": result.get("symbolic_score", 0.0),
+        "weights":        {"neural": 0.75, "symbolic": 0.25},
         "raw_fused":      risk_score,
         "floor_applied":  False,
         "final":          risk_score,
+        "formula":        f"(0.75 × {result.get('neural_score', 0):.3f}) + "
+                          f"(0.25 × {result.get('symbolic_score', 0):.3f}) "
                           f"= {risk_score:.3f}",
     }
     return {
         "overview":                  overview,
         "rules":                     rule_details,
         "general_tip":               general_tip,
         "score_breakdown":           breakdown,
         "confidence":                confidence,
         "risk_breakdown":            risk_breakdown,
         "evidence":                  _flat_evidence(evidence),
         "confidence_level":          confidence.get("level", "Medium"),
+        "natural_language_summary":  "",
         "score_breakdown_text":      _format_score_breakdown_text(breakdown, risk_score),
     }

feature_extractor.py CHANGED Viewed

@@ -1,186 +1,206 @@
 # feature_extractor.py
-# Synonym-aware regex feature extractor with rich evidence trace.
-# Moved out of app.py to be reusable by LIME wrappers, report generation,
-# and downstream explainability tooling.
 from __future__ import annotations
 import re
 class ClauseFeatureExtractor:
-    """
-    Regex-based hybrid extractor.
-    Public API:
-      • extract(text) -> (features, evidence_dict)
-            features: dict[str, bool|int]
-            evidence_dict: dict[feature_name, list[hit]]
-                hit = {"phrase": str, "span": [start, end], "label": str}
-      • extract_unified(text) -> dict
-            Returns the format requested in Riya's Prompt 3:
-                {
-                    "<feature_name>": True/False/int,
-                    ...,
-                    "evidence": [
-                        {"feature": str, "keywords": [str, ...],
-                         "evidence_text": str, "span": [start, end]},
-                        ...
-                    ],
-                }
-      • flatten_evidence(evidence_dict) -> list[dict]
-            Convert the nested evidence dict to a flat list of hits
-            (one hit per matched phrase).
-    """
     BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
         "has_liability_cap": [
-            (re.compile(r"\bshall\s+not\s+exceed\b", re.I),                                "shall not exceed"),
             (re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
-            (re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I),     "cap on liability"),
             (re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|₹)", re.I), "limited to (amount)"),
-            (re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I),         "liability limited"),
         ],
         "excludes_gross_negligence": [
-            (re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I),                            "gross negligence"),
-            (re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I),           "wilful misconduct"),
-            (re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I),  "intentional misconduct"),
-            (re.compile(r"\brecklessness?\b", re.I),                                       "recklessness"),
-            (re.compile(r"\bbad\s+faith\b", re.I),                                         "bad faith"),
-            (re.compile(r"\bfraud(?:ulent)?\b", re.I),                                     "fraud"),
         ],
         "has_liquidated_damages": [
-            (re.compile(r"\bliquidated\s+damages?\b", re.I),                               "liquidated damages"),
             (re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
-            (re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I),                         "penalty amount"),
         ],
         "damages_exceed_loss": [
             (re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
-            (re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I),   "irrespective of loss"),
-            (re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I),              "without proof of loss"),
-            (re.compile(r"\bpenalty\s+clause\b", re.I),                                    "penalty clause"),
         ],
         "is_wagering_clause": [
-            (re.compile(r"\bcontingent\s+on\s+(?:the\s+)?(?:outcome|happening|occurrence)\b", re.I), "contingent on outcome"),
-            (re.compile(r"\b(?:wager|bet|bets|betting|gamble|gambling|speculative)\b", re.I), "wager/bet/speculative"),
-            (re.compile(r"\bdepends?\s+(?:entirely\s+)?on\s+(?:an\s+)?uncertain\s+event\b", re.I), "uncertain event"),
         ],
         "restrains_legal_proceedings": [
             (re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
             (re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
-            (re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\b", re.I),             "relinquish right"),
-            (re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I),         "no legal proceedings"),
         ],
         "unilateral_termination": [
-            (re.compile(r"\bmay\s+terminate\b", re.I),                                     "may terminate"),
-            (re.compile(r"\bsole\s+(?:discretion|option)\b", re.I),                        "sole discretion"),
             (re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
-            (re.compile(r"\bat\s+(?:its|the)\s+(?:sole\s+)?discretion\b", re.I),           "at its discretion"),
         ],
         "notice_period_defined": [
             (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
-            (re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I),                          "notice period of N"),
         ],
         "processes_personal_data": [
-            (re.compile(r"\bpersonal\s+(?:data|information)\b", re.I),                     "personal data"),
             (re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
-            (re.compile(r"\bdata\s+(?:subject|principal)\b", re.I),                        "data subject/principal"),
-            (re.compile(r"\bpii\b", re.I),                                                 "PII"),
         ],
         "processes_sensitive_data": [
-            (re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I),         "sensitive personal data"),
             (re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
         ],
         "has_data_retention_clause": [
-            (re.compile(r"\bretention\s+period\b", re.I),                                  "retention period"),
             (re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
             (re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
-            (re.compile(r"\bpurge|anonymise|delete\s+(?:after|upon)\b", re.I),             "purge/delete after"),
         ],
         "has_breach_notification": [
-            (re.compile(r"\bbreach\s+notification\b", re.I),                               "breach notification"),
-            (re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I),    "notify of breach"),
             (re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
-            (re.compile(r"\bsecurity\s+incident\b", re.I),                                 "security incident"),
         ],
         "has_consent_clause": [
             (re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
-            (re.compile(r"\bopt[\-\s]?in\b", re.I),                                        "opt-in"),
-            (re.compile(r"\bexplicit\s+consent\b", re.I),                                  "explicit consent"),
         ],
         "handles_digital_data": [
-            (re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I),     "digital/online/cloud"),
-            (re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I),    "server/db/SaaS"),
         ],
         "has_security_clause": [
-            (re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I),              "security measures"),
-            (re.compile(r"\bencryption\b", re.I),                                          "encryption"),
-            (re.compile(r"\bcyber[\-\s]?security\b", re.I),                                "cybersecurity"),
-            (re.compile(r"\baccess\s+controls?\b", re.I),                                  "access controls"),
-            (re.compile(r"\biso\s*27001\b", re.I),                                         "ISO 27001"),
         ],
         "assigns_all_ip": [
-            (re.compile(r"\ball\s+intellectual\s+property\b", re.I),                       "all intellectual property"),
-            (re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I),   "assigns all IP"),
             (re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
         ],
         "includes_pre_existing_ip": [
-            (re.compile(r"\bpre[\-\s]?existing\b", re.I),                                  "pre-existing"),
-            (re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I),         "background IP"),
-            (re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I),                  "prior to engagement"),
         ],
         "is_consumer_contract": [
             (re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
         ],
         "has_one_sided_clause": [
-            (re.compile(r"\bsole\s+discretion\b", re.I),                                   "sole discretion"),
-            (re.compile(r"\bwithout\s+(?:any\s+)?liability\b", re.I),                      "without liability"),
-            (re.compile(r"\bno\s+obligation\s+(?:to|whatsoever)\b", re.I),                 "no obligation"),
-            (re.compile(r"\babsolute\s+(?:right|discretion)\b", re.I),                     "absolute right"),
-            (re.compile(r"\bunconditionally\b", re.I),                                     "unconditionally"),
         ],
-        # Indemnity
         "has_indemnity_clause": [
-            (re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I),                       "indemnify/indemnification"),
-            (re.compile(r"\bhold\s+harmless\b", re.I),                                     "hold harmless"),
-            (re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I),    "defend & indemnify"),
         ],
         "indemnity_capped": [
             (re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
             (re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
         ],
-        # Auto-renewal
         "has_auto_renewal": [
-            (re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I),      "auto-renew"),
-            (re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I),                      "renews automatically"),
-            (re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I),                         "evergreen clause"),
-            (re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I),                  "continue to renew"),
         ],
         "has_opt_out_window": [
             (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
-            (re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I),                          "non-renewal notice"),
-            (re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I),    "opt-out renewal"),
         ],
-        # Arbitration
         "has_arbitration": [
-            (re.compile(r"\barbitrat(?:ion|or|al)\b", re.I),                               "arbitration"),
-            (re.compile(r"\barbitral\s+tribunal\b", re.I),                                 "arbitral tribunal"),
         ],
         "arbitration_distant_venue": [
-            (re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
-            (re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
         ],
-        # Exclusivity
         "has_exclusivity": [
-            (re.compile(r"\bexclusiv(?:e|ity|ely)\b", re.I),                               "exclusive/exclusivity"),
-            (re.compile(r"\bsole\s+(?:and\s+exclusive\s+)?(?:supplier|provider|vendor|distributor|licensee)\b", re.I), "sole & exclusive"),
         ],
         "exclusivity_term_defined": [
-            (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+period\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+\b", re.I), "exclusivity term"),
-            (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I),    "exclusivity duration"),
         ],
-        # Unilateral pricing
         "unilateral_price_change": [
             (re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
             (re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
@@ -197,7 +217,6 @@ class ClauseFeatureExtractor:
         "has_exclusivity":          ["exclusivity_term_defined"],
     }
-    # ── Core extraction ────────────────────────────────────────────────────
     def extract(self, text: str) -> tuple[dict, dict]:
         features: dict = {}
         evidence: dict = {}
@@ -229,6 +248,7 @@ class ClauseFeatureExtractor:
                     if child not in features:
                         features[child] = False
         m = re.search(
             r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
             r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
@@ -243,15 +263,13 @@ class ClauseFeatureExtractor:
                 "span":   [start, end],
                 "label":  f"{yrs}-year non-compete",
             }]
-        elif any(re.search(p, text, re.I) for p in (
-            r"\bnon[\-\s]?compete\b", r"\bshall\s+not\s+compete\b")):
             features["non_compete_years"] = 1
         return features, evidence
-    # ── Convenience accessors ──────────────────────────────────────────────
     def extract_unified(self, text: str) -> dict:
-        """Riya's Prompt 3 format: features merged with a flat evidence list."""
         features, evidence_dict = self.extract(text)
         out = dict(features)
         out["evidence"] = self.flatten_evidence(evidence_dict)

 # feature_extractor.py
+# v5.3 — precision-tightened regex pack.
+# Critical changes:
+#   • is_wagering_clause: strict gambling vocab only (no "contingent on …")
+#   • has_exclusivity: contextual phrases only (no bare "exclusive" / "exclusive of")
+#   • has_uncapped_signal: NEW — gates ICA_007 to require explicit uncapped language
 from __future__ import annotations
 import re
 class ClauseFeatureExtractor:
     BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
+        # ── Liability cap (unchanged) ──────────────────────────────────────
         "has_liability_cap": [
+            (re.compile(r"\bshall\s+not\s+exceed\b", re.I), "shall not exceed"),
             (re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
+            (re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I), "cap on liability"),
             (re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|₹)", re.I), "limited to (amount)"),
+            (re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I), "liability limited"),
         ],
         "excludes_gross_negligence": [
+            (re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I), "gross negligence"),
+            (re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I), "wilful misconduct"),
+            (re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I), "intentional misconduct"),
+            (re.compile(r"\brecklessness?\b", re.I), "recklessness"),
+            (re.compile(r"\bbad\s+faith\b", re.I), "bad faith"),
+            (re.compile(r"\bfraud(?:ulent)?\b", re.I), "fraud"),
         ],
+        # ── Liquidated damages (unchanged) ─────────────────────────────────
         "has_liquidated_damages": [
+            (re.compile(r"\bliquidated\s+damages?\b", re.I), "liquidated damages"),
             (re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
+            (re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I), "penalty amount"),
         ],
         "damages_exceed_loss": [
             (re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
+            (re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I), "irrespective of loss"),
+            (re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I), "without proof of loss"),
+            (re.compile(r"\bpenalty\s+clause\b", re.I), "penalty clause"),
         ],
+        # ── WAGERING — TIGHTENED (no "contingent on"; explicit gambling vocab only)
         "is_wagering_clause": [
+            (re.compile(r"\b(?:wager|wagers|wagering|wagered)\b", re.I), "wager"),
+            (re.compile(r"\bgambling\b", re.I), "gambling"),
+            (re.compile(r"\blotter(?:y|ies)\b", re.I), "lottery"),
+            (re.compile(r"\bbetting\b", re.I), "betting"),
+            # "bet/bets on X" or "bet against X" — require a directional preposition followed by a determiner (the/any/an/a)
+            (re.compile(r"\bbets?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "bet on/against"),
+            # "stake on X" or "stake against X" — same treatment
+            (re.compile(r"\bstakes?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "stake on/against"),
         ],
         "restrains_legal_proceedings": [
             (re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
             (re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
+            (re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\s+to\s+(?:sue|claim|recover)\b", re.I), "relinquish right (to sue)"),
+            (re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I), "no legal proceedings"),
         ],
+        # ── Termination (unchanged) ────────────────────────────────────────
         "unilateral_termination": [
+            (re.compile(r"\bmay\s+terminate\b", re.I), "may terminate"),
             (re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
+            (re.compile(r"\bat\s+(?:its|the)\s+sole\s+discretion\b[^.]{0,40}?\bterminat", re.I), "terminate at sole discretion"),
         ],
         "notice_period_defined": [
             (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
+            (re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I), "notice period of N"),
         ],
+        # ── DPDPA / IT Act / personal-data signals (unchanged) ─────────────
         "processes_personal_data": [
+            (re.compile(r"\bpersonal\s+(?:data|information)\b", re.I), "personal data"),
             (re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
+            (re.compile(r"\bdata\s+(?:subject|principal)\b", re.I), "data subject/principal"),
+            (re.compile(r"\bpii\b", re.I), "PII"),
         ],
         "processes_sensitive_data": [
+            (re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I), "sensitive personal data"),
             (re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
         ],
         "has_data_retention_clause": [
+            (re.compile(r"\bretention\s+period\b", re.I), "retention period"),
             (re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
             (re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
+            (re.compile(r"\b(?:purge|anonymise|delete)\s+(?:after|upon)\b", re.I), "purge/delete after"),
         ],
         "has_breach_notification": [
+            (re.compile(r"\bbreach\s+notification\b", re.I), "breach notification"),
+            (re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I), "notify of breach"),
             (re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
+            (re.compile(r"\bsecurity\s+incident\b", re.I), "security incident"),
         ],
         "has_consent_clause": [
             (re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
+            (re.compile(r"\bopt[\-\s]?in\b", re.I), "opt-in"),
+            (re.compile(r"\bexplicit\s+consent\b", re.I), "explicit consent"),
         ],
         "handles_digital_data": [
+            (re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I), "digital/online/cloud"),
+            (re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I), "server/db/SaaS"),
         ],
         "has_security_clause": [
+            (re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I), "security measures"),
+            (re.compile(r"\bencryption\b", re.I), "encryption"),
+            (re.compile(r"\bcyber[\-\s]?security\b", re.I), "cybersecurity"),
+            (re.compile(r"\baccess\s+controls?\b", re.I), "access controls"),
+            (re.compile(r"\biso\s*27001\b", re.I), "ISO 27001"),
         ],
+        # ── IP (unchanged) ─────────────────────────────────────────────────
         "assigns_all_ip": [
+            (re.compile(r"\ball\s+intellectual\s+property\b", re.I), "all intellectual property"),
+            (re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I), "assigns all IP"),
             (re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
         ],
         "includes_pre_existing_ip": [
+            (re.compile(r"\bpre[\-\s]?existing\s+(?:ip|intellectual|materials?|works?|inventions?)\b", re.I), "pre-existing"),
+            (re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I), "background IP"),
+            (re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I), "prior to engagement"),
         ],
+        # ── Consumer (one-sided narrowed slightly) ─────────────────────────
         "is_consumer_contract": [
             (re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
         ],
         "has_one_sided_clause": [
+            # "sole discretion" must be paired with a unilateral action verb to
+            # avoid firing on operational discretion language.
+            (re.compile(r"\bsole\s+discretion\b[^.]{0,50}?\b(?:terminate|modify|change|amend|deny|reject|refuse)\b", re.I), "sole discretion to terminate/modify"),
+            (re.compile(r"\bwithout\s+(?:any\s+)?liability\b", re.I), "without liability"),
+            (re.compile(r"\bno\s+obligation\s+(?:to|whatsoever)\b", re.I), "no obligation"),
+            (re.compile(r"\babsolute\s+(?:right|discretion)\b", re.I), "absolute right"),
+            (re.compile(r"\bunconditionally\b", re.I), "unconditionally"),
         ],
+        # ── Indemnity ──────────────────────────────────────────────────────
         "has_indemnity_clause": [
+            (re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I), "indemnify/indemnification"),
+            (re.compile(r"\bhold\s+harmless\b", re.I), "hold harmless"),
+            (re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I), "defend & indemnify"),
         ],
         "indemnity_capped": [
             (re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
             (re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
         ],
+        # ── NEW: uncapped / unlimited liability signals (gates ICA_007) ───
+        "has_uncapped_signal": [
+            (re.compile(r"\bunlimited\s+(?:liabilit(?:y|ies)|exposure|damages?)\b", re.I), "unlimited liability"),
+            (re.compile(r"\bwithout\s+(?:any\s+)?(?:limit|limitation|cap|ceiling)\b", re.I), "without limit"),
+            (re.compile(r"\bno\s+(?:cap|limit|ceiling|maximum)\b", re.I), "no cap"),
+            (re.compile(r"\bany\s+and\s+all\s+(?:claims?|damages?|losses?|expenses?|liabilit(?:y|ies))\b", re.I), "any and all claims/damages"),
+            (re.compile(r"\b(?:all|every)\s+(?:and\s+any\s+)?(?:claims?|damages?|losses?|liabilit(?:y|ies))\s+(?:whatsoever|of\s+any\s+kind)\b", re.I), "all damages whatsoever"),
+        ],
+        # ── Auto-renewal (unchanged) ───────────────────────────────────────
         "has_auto_renewal": [
+            (re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I), "auto-renew"),
+            (re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I), "renews automatically"),
+            (re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I), "evergreen clause"),
+            (re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I), "continue to renew"),
         ],
         "has_opt_out_window": [
             (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
+            (re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I), "non-renewal notice"),
+            (re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I), "opt-out renewal"),
+            # Plain "30 days notice" near "renew" is a common opt-out
+            (re.compile(r"\brenew\w*[^.]{0,80}?\b\d+\s*(?:days?|weeks?|months?)\s+(?:written\s+)?notice\b", re.I), "N-day notice before renewal"),
         ],
+        # ── Arbitration (unchanged) ────────────────────────────────────────
         "has_arbitration": [
+            (re.compile(r"\barbitrat(?:ion|or|al)\b", re.I), "arbitration"),
+            (re.compile(r"\barbitral\s+tribunal\b", re.I), "arbitral tribunal"),
         ],
         "arbitration_distant_venue": [
+            (re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+|in\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
+            (re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:in|at)\s+(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
         ],
+        # ── EXCLUSIVITY — TIGHTENED (won't match "exclusive of …") ────────
         "has_exclusivity": [
+            # Contextual nouns — "exclusive rights", "exclusive license", "exclusive distributor", etc.
+            (re.compile(r"\bexclusive\s+(?:right|rights|licen[sc]e|licen[sc]ee|distributor|supplier|vendor|agent|territory|market|partner|reseller|dealer)\b", re.I), "exclusive [right/license/distributor/...]"),
+            # Canonical exclusivity idioms
+            (re.compile(r"\bsole\s+and\s+exclusive\b", re.I), "sole and exclusive"),
+            (re.compile(r"\bshall\s+exclusively\b", re.I), "shall exclusively"),
+            (re.compile(r"\bgrant(?:s|ed|ing)?\s+(?:an?\s+|the\s+)?exclusive\s+(?:right|rights|licen[sc]e)\b", re.I), "grants exclusive right/license"),
+            # "exclusively to/with/for/by [party]" — directional, not "exclusive of"
+            (re.compile(r"\bexclusively\s+(?:to|with|for|by)\s+(?:the|its|a)\s+\w+", re.I), "exclusively to/with"),
+            (re.compile(r"\bon\s+an\s+exclusive\s+basis\b", re.I), "on an exclusive basis"),
         ],
         "exclusivity_term_defined": [
+            (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+(?:period|term)\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+", re.I), "exclusivity term"),
+            (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I), "exclusivity duration"),
+            (re.compile(r"\bexclusiv\w*[^.]{0,80}?\bduring\s+the\s+(?:term|initial\s+term)\b", re.I), "during the term"),
         ],
+        # ── Pricing (unchanged) ────────────────────────────────────────────
         "unilateral_price_change": [
             (re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
             (re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
         "has_exclusivity":          ["exclusivity_term_defined"],
     }
     def extract(self, text: str) -> tuple[dict, dict]:
         features: dict = {}
         evidence: dict = {}
                     if child not in features:
                         features[child] = False
+        # Non-compete years — accept either ordering, but require explicit context
         m = re.search(
             r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
             r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
                 "span":   [start, end],
                 "label":  f"{yrs}-year non-compete",
             }]
+        elif re.search(r"\bnon[\-\s]?compete\b|\bshall\s+not\s+compete\b", text, re.I):
             features["non_compete_years"] = 1
         return features, evidence
+    # ── Convenience accessors (unchanged) ──────────────────────────────────
     def extract_unified(self, text: str) -> dict:
         features, evidence_dict = self.extract(text)
         out = dict(features)
         out["evidence"] = self.flatten_evidence(evidence_dict)

inference.py CHANGED Viewed

@@ -1,20 +1,21 @@
 # inference.py
-# Pure utility functions for neuro-symbolic fusion, confidence estimation,
-# and score-breakdown reporting. No module-level mutable globals — all
-# stateful objects live in ModelManager (app.py).
 from __future__ import annotations
-# ── Clause types that should be weighted toward symbolic rules ──────────────
 IP_CLAUSE_TYPES = {
     "IP Ownership Assignment", "Joint IP Ownership",
     "Irrevocable Or Perpetual License",
     "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
 }
-# ── Mapping: rule_id -> list of features it depends on ──────────────────────
-# Used by the explanation engine to surface evidence for each triggered rule
-# without inspecting lambda bytecode. Keep in sync with SYMBOLIC_RULES in app.py.
 RULE_FEATURE_DEPS = {
     "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
     "ICA_002": ["unilateral_termination", "notice_period_defined"],
@@ -22,7 +23,8 @@ RULE_FEATURE_DEPS = {
     "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
     "ICA_005": ["is_wagering_clause"],
     "ICA_006": ["restrains_legal_proceedings"],
-    "ICA_007": ["has_indemnity_clause", "indemnity_capped"],
     "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
     "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
     "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
@@ -35,16 +37,22 @@ RULE_FEATURE_DEPS = {
     "CPA_001":   ["is_consumer_contract", "has_one_sided_clause"],
 }
 def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
-    """
-    Evaluate symbolic rules against extracted features.
-    Returns:
-        {
-            "symbolic_score":  float,        # clamped to [0, 1]
-            "triggered_rules": list[dict],   # rules whose condition fired
-        }
-    """
     triggered, total = [], 0.0
     for rule in symbolic_rules:
         try:
@@ -52,7 +60,6 @@ def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
                 triggered.append(rule)
                 total += rule["penalty"]
         except Exception:
-            # A malformed rule must not crash inference.
             pass
     return {
         "symbolic_score":  round(min(total, 1.0), 3),
@@ -66,46 +73,24 @@ def _neuro_symbolic_fusion(
     is_ip_clause: bool = False,
 ) -> dict:
     """
-    Weighted fusion of neural and symbolic scores with a transparent breakdown.
-    IP clauses shift weight toward symbolic rules (which capture IP-specific law).
-    A non-zero symbolic score forces a Medium-or-higher floor (since rule triggers
-    represent deterministic legal violations).
-    Returns:
-        {
-            "score":     float,
-            "level":     "Low" | "Medium" | "High",
-            "emoji":     str,
-            "breakdown": {
-                "neural_score":        float,
-                "symbolic_score":      float,
-                "weights":             {"neural": float, "symbolic": float},
-                "raw_fused":           float,   # pre-floor
-                "floor_applied":       bool,
-                "final":               float,
-                "formula":             str,     # human-readable computation
-            },
-        }
     """
     if is_ip_clause and symbolic > 0:
-        w_n, w_s = 0.35, 0.65
-    else:
         w_n, w_s = 0.60, 0.40
     raw   = w_n * neural + w_s * symbolic
-    floor = symbolic > 0 and raw < 0.30
-    score = max(raw, 0.30) if floor else raw
-    score = round(min(score, 1.0), 3)
-    if   score <= 0.33: level, emoji = "Low",    "🟢"
-    elif score <= 0.66: level, emoji = "Medium", "🟡"
-    else:               level, emoji = "High",   "🔴"
     formula = (
         f"({w_n:.2f} × {neural:.3f}) + ({w_s:.2f} × {symbolic:.3f}) "
         f"= {round(raw, 3)}"
-        + (f"  →  floor 0.30 applied (symbolic triggers present)" if floor else "")
     )
     return {
@@ -117,7 +102,7 @@ def _neuro_symbolic_fusion(
             "symbolic_score": round(symbolic, 3),
             "weights":        {"neural": w_n, "symbolic": w_s},
             "raw_fused":      round(raw, 3),
-            "floor_applied":  bool(floor),
             "final":          score,
             "formula":        formula,
         },
@@ -132,30 +117,19 @@ def _compute_confidence(
     neural_loaded: bool = True,
 ) -> dict:
     """
-    Estimate prediction confidence on three signals:
-      1. boundary_dist  — distance of the fused score from a risk-level boundary
-      2. agreement      — how closely neural and symbolic agree
-      3. rule_strength  — how many symbolic rules fired
-    Returns:
-        {
-            "level":   "Low" | "Medium" | "High",
-            "score":   float,
-            "factors": { "boundary_dist": float, "agreement": float,
-                         "rule_strength": float },
-        }
     """
-    # Distance from the nearest decision boundary (0.33 or 0.66)
-    boundary_dist = min(abs(fused - 0.33), abs(fused - 0.66))
-    dist_factor   = min(boundary_dist / 0.18, 1.0)
-    # Neural vs symbolic agreement (only meaningful if neural is loaded)
     if neural_loaded:
         agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
     else:
-        agree_factor = 0.5  # neutral when neural is unavailable
-    # Rule signal — more triggers = stronger deterministic evidence
     if   num_triggered == 0: rule_factor = 0.40
     elif num_triggered == 1: rule_factor = 0.70
     else:                    rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

 # inference.py
+# Hardening v5.3:
+#  - Neural-dominant fusion (default 0.75 / 0.25, IP 0.60 / 0.40)
+#  - Symbolic floor of 0.30 REMOVED
+#  - Risk-level thresholds: Low < 0.50, Medium 0.50–0.80, High > 0.80
+#  - Confidence recalibrated for the new thresholds
+#  - RULE_FEATURE_DEPS updated for tightened ICA_007
 from __future__ import annotations
 IP_CLAUSE_TYPES = {
     "IP Ownership Assignment", "Joint IP Ownership",
     "Irrevocable Or Perpetual License",
     "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
 }
+# Rule -> feature dependencies. Used by the explanation engine to surface
+# matched evidence per rule (no lambda introspection required).
 RULE_FEATURE_DEPS = {
     "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
     "ICA_002": ["unilateral_termination", "notice_period_defined"],
     "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
     "ICA_005": ["is_wagering_clause"],
     "ICA_006": ["restrains_legal_proceedings"],
+    # ICA_007 tightened: now requires has_uncapped_signal too
+    "ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
     "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
     "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
     "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
     "CPA_001":   ["is_consumer_contract", "has_one_sided_clause"],
 }
+# ── Risk-level thresholds (single source of truth) ──────────────────────────
+RISK_LOW_MAX    = 0.50    # < 0.50 → Low
+RISK_MEDIUM_MAX = 0.80    # 0.50–0.80 → Medium; > 0.80 → High
+def level_from_score(score: float) -> tuple[str, str]:
+    """Return (level_label, emoji) for a fused score under the v5.3 thresholds.
+
+    Bands: score < RISK_LOW_MAX is "Low"; RISK_LOW_MAX <= score <=
+    RISK_MEDIUM_MAX is "Medium" (both ends inclusive); anything above
+    RISK_MEDIUM_MAX is "High".
+    """
+    if score < RISK_LOW_MAX:
+        return "Low",    "🟢"
+    # Upper bound is inclusive: a score exactly equal to RISK_MEDIUM_MAX is Medium.
+    if score <= RISK_MEDIUM_MAX:
+        return "Medium", "🟡"
+    return "High", "🔴"
 def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
+    """Evaluate symbolic rules. Score is clamped to [0, 1]."""
     triggered, total = [], 0.0
     for rule in symbolic_rules:
         try:
                 triggered.append(rule)
                 total += rule["penalty"]
         except Exception:
             pass
     return {
         "symbolic_score":  round(min(total, 1.0), 3),
     is_ip_clause: bool = False,
 ) -> dict:
     """
+    Weighted fusion — neural-dominant by design.
+    No artificial floor: a weak symbolic trigger no longer inflates risk.
     """
+    # Default neural-dominant. IP clauses give a bit more weight to symbolic,
+    # but symbolic NEVER outweighs neural.
     if is_ip_clause and symbolic > 0:
         w_n, w_s = 0.60, 0.40
+    else:
+        w_n, w_s = 0.75, 0.25
     raw   = w_n * neural + w_s * symbolic
+    score = round(min(max(raw, 0.0), 1.0), 3)
+    level, emoji = level_from_score(score)
     formula = (
         f"({w_n:.2f} × {neural:.3f}) + ({w_s:.2f} × {symbolic:.3f}) "
         f"= {round(raw, 3)}"
     )
     return {
             "symbolic_score": round(symbolic, 3),
             "weights":        {"neural": w_n, "symbolic": w_s},
             "raw_fused":      round(raw, 3),
+            "floor_applied":  False,   # retained for UI compat; always False now
             "final":          score,
             "formula":        formula,
         },
     neural_loaded: bool = True,
 ) -> dict:
     """
+    Three-factor confidence calibrated for the new thresholds.
+      boundary_dist  – distance from the nearest risk-level cutoff (0.50, 0.80)
+      agreement      – 1 - |neural - symbolic|  (only when neural is loaded)
+      rule_strength  – more triggered rules ⇒ stronger deterministic evidence
     """
+    boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
+    dist_factor   = min(boundary_dist / 0.20, 1.0)
     if neural_loaded:
         agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
     else:
+        agree_factor = 0.5
     if   num_triggered == 0: rule_factor = 0.40
     elif num_triggered == 1: rule_factor = 0.70
     else:                    rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

local_interpreters.py CHANGED Viewed

@@ -1,10 +1,10 @@
 # local_interpreters.py
-# Lightweight local explainability:
-#   • LIME text explanations (capped at 50 perturbations for CPU sanity)
-#   • Longformer attention extraction (one extra forward pass, runs only
-#     when a clause is opened — never during batch analysis)
 from __future__ import annotations
 import time
 import numpy as np
 import torch
@@ -16,15 +16,65 @@ except Exception:
     _LIME_AVAILABLE = False
 class LocalExplainer:
-    """
-    LIME + attention extraction.
-    Methods:
-      • explain_with_lime(text, predict_fn) -> list[{word, weight}]
-      • get_attention_map(text, model, tokenizer) -> list[{token, weight}]
-    """
     def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
         self.num_samples     = num_samples
         self.timeout_seconds = timeout_seconds
@@ -44,38 +94,43 @@ class LocalExplainer:
         self,
         text: str,
         predict_fn,
-        num_features: int = 10,
     ) -> list[dict]:
         """
-        predict_fn(list_of_texts) -> np.ndarray shape (n_texts, 2)
-            column 0 = "safe" prob (1 - risk_score)
-            column 1 = "risky" prob (risk_score)
-        Returns a sorted list of {word, weight} (positive = pushes toward risky).
         """
         if not _LIME_AVAILABLE or self._lime is None:
             return []
         try:
             t0 = time.time()
             exp = self._lime.explain_instance(
-                text_instance=text[:1500],      # keep LIME bounded
                 classifier_fn=predict_fn,
                 num_features=num_features,
                 num_samples=self.num_samples,
                 labels=(1,),
             )
             elapsed = time.time() - t0
-            print(f"[INFO] LIME completed in {elapsed:.1f}s "
-                  f"(samples={self.num_samples}, features={num_features})")
-            return [
-                {"word": w, "weight": round(float(s), 4)}
-                for w, s in exp.as_list(label=1)
             ]
         except Exception as e:
             self.last_error = str(e)
             print(f"[WARN] LIME failed: {e}")
             return []
-    # ── Attention ──────────────────────────────────────────────────────────
     def get_attention_map(
         self,
         text: str,
@@ -84,12 +139,6 @@ class LocalExplainer:
         max_length: int = 256,
         top_k: int = 30,
     ) -> list[dict]:
-        """
-        Extract Longformer last-layer global attention from CLS over the
-        sequence, average over heads, return top_k tokens by weight.
-        Runs a second forward pass with output_attentions=True (memory cost
-        is bounded because we only do this for a single clause on demand).
-        """
         if model is None or tokenizer is None:
             return []
         try:
@@ -100,8 +149,6 @@ class LocalExplainer:
             input_ids       = enc["input_ids"]
             attention_mask  = enc["attention_mask"]
-            # Force the CLS token (position 0) to use global attention so we
-            # get a proper distribution over the whole sequence.
             global_mask = torch.zeros_like(input_ids)
             global_mask[:, 0] = 1
@@ -113,20 +160,16 @@ class LocalExplainer:
                     output_attentions      = True,
                 )
-            # Longformer exposes global_attentions when global tokens exist.
-            # Shape: tuple of (batch, num_heads, num_global_tokens, seq_len)
             if not getattr(out, "global_attentions", None):
                 return []
-            last_global = out.global_attentions[-1]            # last layer
-            cls_attn    = last_global[0, :, 0, :].mean(dim=0)   # avg heads
-            # Trim to real (non-pad) tokens
-            n_real = int(attention_mask[0].sum().item())
-            tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
             weights = cls_attn[:n_real].cpu().numpy()
-            # Skip special tokens for the ranking, but keep them in the sequence
             specials = {tokenizer.cls_token, tokenizer.sep_token,
                         tokenizer.pad_token, tokenizer.bos_token,
                         tokenizer.eos_token, tokenizer.unk_token}
@@ -145,15 +188,11 @@ class LocalExplainer:
             return []
-# ── Helper: build a LIME-compatible predict_fn from ModelManager ────────────
 def build_predict_fn_for_manager(manager, max_length: int = 256):
-    """
-    Returns a callable LIME can use: list[str] -> np.ndarray (n, 2).
-    Uses the manager's model + tokenizer. Falls back to symbolic if the
-    neural model isn't loaded (each text just gets [1-sym, sym]).
-    """
     def predict_fn(texts: list[str]) -> np.ndarray:
-        # Symbolic-only fallback path
         if not (manager.is_ready and manager.model is not None):
             from inference import _symbolic_rule_score
             from app import SYMBOLIC_RULES
@@ -164,7 +203,6 @@ def build_predict_fn_for_manager(manager, max_length: int = 256):
                 probs.append([1.0 - sym, sym])
             return np.array(probs, dtype=np.float32)
-        # Neural path — batch through Longformer
         enc = manager.tokenizer(
             list(texts),
             padding="max_length", truncation=True,

 # local_interpreters.py
+# v5.3 — LIME output filtered for legal interpretability.
+# The neural model still sees ORIGINAL text. Filtering happens at the
+# display layer only.
 from __future__ import annotations
+import re
 import time
 import numpy as np
 import torch
     _LIME_AVAILABLE = False
+# ─────────────────────────────────────────────────────────────────────────────
+# Token filtering — display-time only
+# ─────────────────────────────────────────────────────────────────────────────
+LIME_STOPWORDS = {
+    "a", "an", "the",
+    "of", "in", "on", "at", "to", "for", "by", "with", "from",
+    "and", "or", "but",
+    "this", "that", "these", "those",
+    "it", "its",
+    "be", "is", "are", "was", "were", "been", "being",
+    "have", "has", "had", "do", "does", "did",
+    "as", "if", "so", "than", "then",
+    "any", "all", "such", "no",   # ambiguous but mostly noise here
+    "i", "we", "you", "they", "he", "she",
+}
+# Legal modal / operative words — never filter these even if they look small
+LIME_KEEP_LEGAL = {
+    "shall", "may", "must", "not", "only", "unless", "except", "without",
+    "subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
+    "terminate", "termination", "exclusive", "exclusively", "exclusivity",
+    "warrant", "warranty", "breach", "obligation", "covenant", "license",
+    "licence", "damages", "consent", "notice", "renew", "renewal",
+    "arbitration", "arbitrator", "jurisdiction", "wager", "gambling",
+    "assign", "assignment", "limit", "cap", "uncapped", "unlimited",
+    "confidential", "disclose", "non-compete", "non-solicit",
+}
+# Roman numeral regex (used for filtering things like "IV", "xii")
+_ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
+def _clean_token(raw: str) -> str:
+    """Strip leading/trailing punctuation; return lowercased core."""
+    return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
+def _is_useful_lime_token(raw: str) -> bool:
+    core = _clean_token(raw)
+    if not core:
+        return False
+    if core in LIME_KEEP_LEGAL:
+        return True
+    if core in LIME_STOPWORDS:
+        return False
+    if core.isdigit():
+        return False
+    if _ROMAN_NUMERAL.fullmatch(core):
+        return False
+    # Need at least 2 alphanumeric chars to be a meaningful word
+    if sum(c.isalnum() for c in core) < 2:
+        return False
+    return True
+# ─────────────────────────────────────────────────────────────────────────────
+# LocalExplainer
+# ─────────────────────────────────────────────────────────────────────────────
 class LocalExplainer:
     def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
         self.num_samples     = num_samples
         self.timeout_seconds = timeout_seconds
         self,
         text: str,
         predict_fn,
+        num_features: int = 25,     # raw — over-request, filter later
+        display_count: int = 12,
     ) -> list[dict]:
         """
+        Returns up to `display_count` filtered token contributions, sorted by
+        absolute weight. The model itself still sees the original full text.
         """
         if not _LIME_AVAILABLE or self._lime is None:
             return []
         try:
             t0 = time.time()
             exp = self._lime.explain_instance(
+                text_instance=text[:1500],
                 classifier_fn=predict_fn,
                 num_features=num_features,
                 num_samples=self.num_samples,
                 labels=(1,),
             )
             elapsed = time.time() - t0
+            raw_pairs = exp.as_list(label=1)
+            # Filter for legal interpretability
+            filtered: list[tuple[str, float]] = [
+                (w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
             ]
+            filtered.sort(key=lambda x: abs(x[1]), reverse=True)
+            top = filtered[:display_count]
+            print(f"[INFO] LIME {elapsed:.1f}s, raw={len(raw_pairs)}, "
+                  f"filtered={len(filtered)}, displayed={len(top)}")
+            return [{"word": w, "weight": round(s, 4)} for w, s in top]
         except Exception as e:
             self.last_error = str(e)
             print(f"[WARN] LIME failed: {e}")
             return []
+    # ── Attention (unchanged) ──────────────────────────────────────────────
     def get_attention_map(
         self,
         text: str,
         max_length: int = 256,
         top_k: int = 30,
     ) -> list[dict]:
         if model is None or tokenizer is None:
             return []
         try:
             input_ids       = enc["input_ids"]
             attention_mask  = enc["attention_mask"]
             global_mask = torch.zeros_like(input_ids)
             global_mask[:, 0] = 1
                     output_attentions      = True,
                 )
             if not getattr(out, "global_attentions", None):
                 return []
+            last_global = out.global_attentions[-1]
+            cls_attn    = last_global[0, :, 0, :].mean(dim=0)
+            n_real  = int(attention_mask[0].sum().item())
+            tokens  = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
             weights = cls_attn[:n_real].cpu().numpy()
             specials = {tokenizer.cls_token, tokenizer.sep_token,
                         tokenizer.pad_token, tokenizer.bos_token,
                         tokenizer.eos_token, tokenizer.unk_token}
             return []
+# ─────────────────────────────────────────────────────────────────────────────
+# Predict-fn factory (unchanged)
+# ─────────────────────────────────────────────────────────────────────────────
 def build_predict_fn_for_manager(manager, max_length: int = 256):
     def predict_fn(texts: list[str]) -> np.ndarray:
         if not (manager.is_ready and manager.model is not None):
             from inference import _symbolic_rule_score
             from app import SYMBOLIC_RULES
                 probs.append([1.0 - sym, sym])
             return np.array(probs, dtype=np.float32)
         enc = manager.tokenizer(
             list(texts),
             padding="max_length", truncation=True,

pdf_utils.py CHANGED Viewed

@@ -1,157 +1,188 @@
 # pdf_utils.py
-# PDF text extraction + advanced clause segmentation for ClauseXplain
 from __future__ import annotations
 import re
-def extract_text_from_pdf(file_path: str) -> str:
-    """
-    Extract and clean text from all pages of a PDF using PyMuPDF.
-    """
-    import fitz  # pymupdf
-    doc = fitz.open(file_path)
-    pages = []
-    for page in doc:
-        text = page.get_text("text")
-        pages.append(text)
     doc.close()
     raw = "\n".join(pages)
-    # Normalize whitespace while preserving structure
-    raw = re.sub(r"\r\n", "\n", raw)
-    raw = re.sub(r"[ \t]+", " ", raw)
-    raw = re.sub(r"\n{3,}", "\n\n", raw)
     return raw.strip()
-def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
     """
-    Backward-compatible clause splitter.
-    Returns:
-        list[str]
     """
-    clauses = split_into_clauses_with_metadata(text, min_length)
-    return [c["text"] for c in clauses]
-def split_into_clauses_with_metadata(text: str, min_length: int = 40) -> list[dict]:
-    """
-    Advanced legal clause segmentation with metadata.
-    Detects:
-    1. Numbered clauses      → 1., 1.1, 2.3.4
-    2. Articles / Sections   → Article 5, Section 7
-    3. Lettered clauses      → (a), (b), (c)
-    4. ALL CAPS headings     → GOVERNING LAW
-    5. Paragraph fallback
-    Returns:
-    [
         {
-            "text": "...",
-            "number": "1.2",
-            "kind": "numbered"
         }
     ]
-    """
-    if not text or not text.strip():
-        return []
-    text = text.strip()
-    numbered_pattern = re.compile(
-        r'(?m)^(?=\s*(?:\d+(?:\.\d+)*\.?|Article\s+\d+|Section\s+\d+))',
-        re.IGNORECASE
-    )
-    lettered_pattern = re.compile(
-        r'(?m)^(?=\s*\([a-z]\))'
-    )
-    caps_pattern = re.compile(
-        r'(?m)^(?=\s*[A-Z][A-Z\s]{4,}$)'
-    )
-    def extract_clause_number(clause: str, kind: str):
-        if kind == "numbered":
-            match = re.match(
-                r'^\s*(\d+(?:\.\d+)*\.?|Article\s+\d+|Section\s+\d+)',
-                clause,
-                re.IGNORECASE
             )
-            return match.group(1) if match else None
-        elif kind == "lettered":
-            match = re.match(r'^\s*(\([a-z]\))', clause)
-            return match.group(1) if match else None
-        return None
-    def build_metadata(parts: list[str], kind: str) -> list[dict]:
-        results = []
-        for part in parts:
-            clause = part.strip()
-            if len(clause) < min_length:
-                continue
-            results.append({
-                "text": clause,
-                "number": extract_clause_number(clause, kind),
-                "kind": kind
-            })
-        return results
-    # Strategy 1: numbered clauses
-    numbered_parts = [p.strip() for p in numbered_pattern.split(text) if p.strip()]
-    if len(numbered_parts) > 1:
-        result = build_metadata(numbered_parts, "numbered")
-        if len(result) > 1:
-            return result
-    # Strategy 2: lettered clauses
-    lettered_parts = [p.strip() for p in lettered_pattern.split(text) if p.strip()]
-    if len(lettered_parts) > 1:
-        result = build_metadata(lettered_parts, "lettered")
-        if len(result) > 1:
-            return result
-    # Strategy 3: ALL CAPS headings
-    caps_parts = [p.strip() for p in caps_pattern.split(text) if p.strip()]
-    if len(caps_parts) > 1:
-        result = build_metadata(caps_parts, "caps_heading")
-        if len(result) > 1:
-            return result
-    # Strategy 4: paragraph fallback
-    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
-    fallback = []
-    for para in paragraphs:
-        if len(para) >= min_length:
-            fallback.append({
-                "text": para,
-                "number": None,
-                "kind": "paragraph"
-            })
-    # Final fallback: whole document as one clause
-    if not fallback and len(text) >= min_length:
-        fallback.append({
-            "text": text,
-            "number": None,
-            "kind": "full_text"
-        })
-    return fallback

 # pdf_utils.py
+# v5.3 — finer-grained clause segmentation.
+# Adds:
+#   • Inline subclause splitting for long clauses (a), (b), (c), (i), (ii) …
+#   • Hard length cap with sentence-boundary fallback
 from __future__ import annotations
 import re
+LONG_CLAUSE_CHARS = 1200
+MAX_CLAUSE_CHARS  = 3000
+MIN_SUBCLAUSE_LEN = 60
+# ─────────────────────────────────────────────────────────────────────────────
+# PDF extraction (unchanged)
+# ─────────────────────────────────────────────────────────────────────────────
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the plain text of every page of a PDF and normalise whitespace.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The page texts joined by newlines, with CRLF converted to LF, runs of
        spaces/tabs collapsed to one space, three or more consecutive newlines
        reduced to two, words hyphenated across a line break re-joined, and
        leading/trailing whitespace removed.
    """
    # Imported inside the function, so importing this module does not by
    # itself require PyMuPDF.
    import fitz

    doc = fitz.open(file_path)
    try:
        pages = [page.get_text("text") for page in doc]
    finally:
        # Release the document even when a page fails to extract.
        doc.close()

    raw = "\n".join(pages)
    raw = re.sub(r'\r\n', '\n', raw)
    raw = re.sub(r'[ \t]+', ' ', raw)
    raw = re.sub(r'\n{3,}', '\n\n', raw)
    # Undo end-of-line hyphenation: "indem-\nnify" -> "indemnify".
    raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)
    return raw.strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# Header detection (primary segmentation)
+# ─────────────────────────────────────────────────────────────────────────────
+_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
+    ("decimal",  re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
+    ("article",  re.compile(
+        r'(?m)^\s*((?:Article|Section|Clause|Schedule|Annexure|Annex|Appendix|Part|Chapter)'
+        r'\s+(?:\d+(?:\.\d+){0,2}|[IVXLC]+))[\s\.\-:]', re.IGNORECASE)),
+    ("lettered", re.compile(r'(?m)^\s*(\(\s*[a-zA-Z]{1,4}\s*\))\s+(?=\S)')),
+    ("roman",    re.compile(r'(?m)^\s*([IVX]{1,5}\.)\s+(?=\S)')),
+    ("caps",     re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
+]
+# Inline subclause markers — used in the SECOND pass (mid-text, not line-start)
+_INLINE_SUBCLAUSE = re.compile(
+    r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
+    re.IGNORECASE,
+)
def _collect_headers(text: str) -> list[tuple[int, str, str]]:
    """Locate every clause header in ``text``.

    Each pattern in ``_HEADER_PATTERNS`` is run over the text. A hit is
    reported as ``(offset, marker, kind)`` where ``offset`` is the position at
    which the marker (capture group 1) starts and ``marker`` is its stripped
    text. Hits are returned in ascending offset order; a hit starting within
    two characters of the previously kept one is discarded, so when several
    patterns fire on the same header only the first survives.
    """
    # sorted() is stable, so hits at the same offset keep the order in which
    # the patterns are listed in _HEADER_PATTERNS.
    found = sorted(
        (
            (match.start(1), match.group(1).strip(), kind)
            for kind, pattern in _HEADER_PATTERNS
            for match in pattern.finditer(text)
        ),
        key=lambda hit: hit[0],
    )

    unique: list[tuple[int, str, str]] = []
    for hit in found:
        if unique and abs(hit[0] - unique[-1][0]) <= 2:
            continue  # same header already captured by an earlier pattern
        unique.append(hit)
    return unique
+# ─────────────────────────────────────────────────────────────────────────────
+# Inline subclause post-processing
+# ─────────────────────────────────────────────────────────────────────────────
def _split_inline_subclauses(
    body: str,
    parent_number: str | None = None,
    min_length: int = MIN_SUBCLAUSE_LEN,
) -> list[dict]:
    """Split a clause body on inline subclause markers such as ``(a)`` / ``(ii)``.

    Args:
        body: Full text of the clause to split.
        parent_number: Number of the enclosing clause, if any. It is prefixed
            to each subclause marker (e.g. "5.7" + "(a)" -> "5.7(a)").
        min_length: Chunks shorter than this are not emitted on their own.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts, or ``[]`` when the body
        holds fewer than two markers or no chunk is long enough (the caller
        then keeps the original clause).

    Short chunks are never discarded once a split is produced: a short chunk
    is appended to the preceding part, or, when no part exists yet, prepended
    to the next subclause that is emitted. In the latter case the emitted part
    carries the number of the marker that was long enough to stand alone.
    """
    matches = list(_INLINE_SUBCLAUSE.finditer(body))
    if len(matches) < 2:
        return []

    parts: list[dict] = []

    # First chunk: text before the first marker (usually the parent header line)
    head = body[:matches[0].start()].strip()
    if head and len(head) >= 30:
        parts.append({
            "text":   head,
            "number": parent_number,
            "kind":   "decimal" if parent_number else "paragraph",
        })

    # Short chunks met before any part exists. Previously these were dropped
    # because there was nothing to fold them into.
    pending: list[str] = []
    ends = [m.start() for m in matches[1:]] + [len(body)]
    for m, end in zip(matches, ends):
        chunk = body[m.start():end].strip()
        if len(chunk) < min_length:
            # Too short to be a real subclause — fold into a neighbour
            if parts:
                parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
            else:
                pending.append(chunk)
            continue

        if pending:
            chunk = "\n".join([*pending, chunk])
            pending = []

        sub_marker = m.group(1).strip()
        composite = f"{parent_number}{sub_marker}" if parent_number else sub_marker
        parts.append({
            "text":   chunk,
            "number": composite,
            "kind":   "subclause",
        })
    return parts
def _slice_oversize(sentence: str, max_len: int) -> list[str]:
    """Cut one over-long sentence into pieces of at most ``max_len`` characters.

    Cuts are made at the last space, newline or tab inside the window when
    there is one, otherwise mid-word. ``max_len`` must be at least 1.
    """
    pieces: list[str] = []
    rest = sentence
    while len(rest) > max_len:
        window = rest[:max_len + 1]
        cut = max(window.rfind(ws) for ws in (" ", "\n", "\t"))
        if cut <= 0:
            cut = max_len  # no usable whitespace: hard cut
        pieces.append(rest[:cut].rstrip())
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
    """Break an over-long clause into chunks of at most ``max_len`` characters.

    Args:
        clause: Clause dict; ``"text"`` is required, ``"number"`` and
            ``"kind"`` are copied onto every chunk when present.
        max_len: Maximum chunk length in characters.

    Returns:
        ``[clause]`` unchanged when the text already fits. Otherwise a list of
        new dicts whose ``"kind"`` is the original kind plus ``"/chunked"``.
        Chunks shorter than ``MIN_SUBCLAUSE_LEN`` are omitted.

    The text is split on sentence boundaries first. A sentence that alone
    exceeds ``max_len`` (for example a long semicolon-separated list) is cut
    further at whitespace, so that the cap really is enforced.
    """
    body = clause["text"]
    if len(body) <= max_len:
        return [clause]

    pieces: list[str] = []
    for sentence in re.split(r'(?<=[\.\?\!])\s+(?=[A-Z])', body):
        if max_len < 1 or len(sentence) <= max_len:
            pieces.append(sentence)
        else:
            pieces.extend(_slice_oversize(sentence, max_len))

    # Pack consecutive pieces greedily without exceeding the cap.
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        if current and len(current) + len(piece) + 1 > max_len:
            chunks.append(current.strip())
            current = piece
        else:
            current = (current + " " + piece).strip() if current else piece
    if current:
        chunks.append(current.strip())

    return [
        {
            "text":   c,
            "number": clause.get("number"),
            "kind":   clause.get("kind", "paragraph") + "/chunked",
        }
        for c in chunks if len(c) >= MIN_SUBCLAUSE_LEN
    ]
+# ─────────────────────────────────────────────────────────────────────────────
+# Public API
+# ─────────────────────────────────────────────────────────────────────────────
def _paragraph_clauses(text: str, min_length: int) -> list[dict]:
    """Return blank-line-separated paragraphs of ``text`` as un-numbered clauses."""
    return [
        {"text": p, "number": None, "kind": "paragraph"}
        for p in (raw.strip() for raw in re.split(r'\n\s*\n', text))
        if len(p) >= min_length
    ]


def split_into_clauses_with_metadata(
    text: str,
    min_length: int = 40,
) -> list[dict]:
    """Segment contract text into clauses, each with a number and a kind.

    Args:
        text: Full document text. ``None``, empty or whitespace-only input
            yields ``[]``.
        min_length: Minimum length in characters for a primary segment to be
            kept.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts in document order.

    Three passes are applied:

    1. Primary: split at the headers found by ``_collect_headers``. Text that
       precedes the first header is kept as paragraph clauses instead of being
       dropped. When no header-based clause qualifies, the whole text is split
       into paragraphs.
    2. Secondary: clauses longer than ``LONG_CLAUSE_CHARS`` are split on
       inline subclause markers where possible.
    3. Tertiary: every clause goes through ``_hard_cap_split``.
    """
    if not text or not text.strip():
        return []

    headers = _collect_headers(text)

    # ── Primary segmentation (heading-based) ───────────────────────────────
    sections: list[dict] = []
    ends = [h[0] for h in headers[1:]] + [len(text)]
    for (start, marker, kind), end in zip(headers, ends):
        body = text[start:end].strip()
        if len(body) >= min_length:
            sections.append({"text": body, "number": marker, "kind": kind})

    if sections:
        # Keep whatever comes before the first header (title, recitals, or
        # un-numbered opening sections) rather than discarding it.
        primary = _paragraph_clauses(text[:headers[0][0]], min_length) + sections
    else:
        # Paragraph fallback when no usable headers were found
        primary = _paragraph_clauses(text, min_length)

    # ── Secondary pass: inline subclause splitting for long clauses ────────
    refined: list[dict] = []
    for clause in primary:
        if len(clause["text"]) > LONG_CLAUSE_CHARS:
            subs = _split_inline_subclauses(
                clause["text"],
                parent_number=clause.get("number"),
            )
            if subs:
                refined.extend(subs)
                continue
        refined.append(clause)

    # ── Tertiary pass: hard length cap (sentence-boundary chunking) ────────
    final: list[dict] = []
    for clause in refined:
        final.extend(_hard_cap_split(clause))
    return final
def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
    """Return only the clause texts, for callers that do not need metadata.

    Delegates to ``split_into_clauses_with_metadata`` and keeps the ``"text"``
    field of each clause, in the same order.
    """
    clauses = split_into_clauses_with_metadata(text, min_length)
    return [entry["text"] for entry in clauses]