riyasuryawanshi746 committed on
Commit
af910e9
·
verified ·
1 Parent(s): 92625e7

Explainability and Symbolic part fixed 12th May

Browse files
Files changed (6) hide show
  1. app.py +40 -68
  2. explanation.py +13 -53
  3. feature_extractor.py +125 -107
  4. inference.py +40 -66
  5. local_interpreters.py +86 -48
  6. pdf_utils.py +159 -128
app.py CHANGED
@@ -1,7 +1,10 @@
1
  # app.py
2
- # ClauseXplain v5.2 β€” full explainability stack
3
- # Integrates: feature_extractor, nl_summary, local_interpreters,
4
- # attention_visualization, report
 
 
 
5
 
6
  from __future__ import annotations
7
  import os
@@ -18,10 +21,10 @@ from transformers import LongformerTokenizer, LongformerModel
18
  from sklearn.preprocessing import MultiLabelBinarizer
19
  from huggingface_hub import hf_hub_download
20
 
21
- # ── New modules ───────────────────────────────────────────────────────────────
22
  from feature_extractor import ClauseFeatureExtractor
23
  from explanation import generate_explanation
24
- from utils import highlight_keywords # (others unused)
 
25
 
26
  # ── Optional / fail-soft integrations ─────────────────────────────────────────
27
  try:
@@ -52,10 +55,8 @@ except Exception as _e:
52
  print(f"[WARN] report disabled: {_e}")
53
  generate_report = None
54
 
55
- # ── Device β€” always CPU on HF free tier ───────────────────────────────────────
56
  DEVICE = torch.device("cpu")
57
 
58
- # ── Label sets (unchanged) ────────────────────────────────────────────────────
59
  CLAUSE_CLASSES = [
60
  "Cap On Liability", "Change Of Control", "Covenant Not To Sue",
61
  "Exclusivity", "Governing Law", "IP Ownership Assignment",
@@ -76,7 +77,7 @@ CLAUSE_CLASSES = [
76
  RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
77
 
78
  # ─────────────────────────────────────────────────────────────────────────────
79
- # Symbolic rules β€” original 12 + 5 newly added
80
  # ─────────────────────────────────────────────────────────────────────────────
81
  SYMBOLIC_RULES = [
82
  {"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
@@ -91,21 +92,28 @@ SYMBOLIC_RULES = [
91
  {"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
92
  "reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
93
  "condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
94
- {"rule_id": "ICA_005", "name": "Wagering / Contingency Clause",
 
95
  "reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
96
  "condition": lambda f: f.get("is_wagering_clause")},
97
  {"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
98
  "reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
99
  "condition": lambda f: f.get("restrains_legal_proceedings")},
 
100
  {"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
101
  "reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
102
- "condition": lambda f: f.get("has_indemnity_clause") and not f.get("indemnity_capped")},
 
 
 
 
103
  {"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
104
  "reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
105
  "condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
106
  {"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
107
  "reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
108
  "condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
 
109
  {"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
110
  "reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
111
  "condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
@@ -134,7 +142,7 @@ SYMBOLIC_RULES = [
134
 
135
 
136
  # ─────────────────────────────────────────────────────────────────────────────
137
- # Model definition (unchanged)
138
  # ─────────────────────────────────────────────────────────────────────────────
139
  class ClauseXplainV5(nn.Module):
140
  def __init__(self, num_clause_labels: int, num_risk_labels: int):
@@ -211,7 +219,6 @@ class ModelManager:
211
  repo_id="riyasuryawanshi746/clauseXplain",
212
  filename="clausexplain_v5_best.pt",
213
  )
214
- print(f"[INFO] Checkpoint at: {ckpt_path}")
215
  checkpoint = torch.load(
216
  ckpt_path,
217
  map_location=torch.device("cpu"),
@@ -301,7 +308,7 @@ class ModelManager:
301
  "top_risk_cats": top_risks,
302
  "triggered_rules": triggered_clean,
303
  "features": {k: v for k, v in features.items() if v},
304
- "evidence": evidence, # nested dict (kept for explanation engine)
305
  "score_breakdown": fusion["breakdown"],
306
  "confidence": confidence,
307
  }
@@ -333,9 +340,8 @@ class ModelManager:
333
 
334
  scores = [r["risk_score"] for r in results]
335
  overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
336
- if overall <= 0.33: level = "Low"
337
- elif overall <= 0.66: level = "Medium"
338
- else: level = "High"
339
  return {
340
  "overall_risk": overall,
341
  "overall_level": level,
@@ -348,7 +354,7 @@ class ModelManager:
348
  manager = ModelManager()
349
 
350
  # ═══════════════════════════════════════════════════════════════════════════════
351
- # UI helpers
352
  # ═══════════════════════════════════════════════════════════════════════════════
353
  LEVEL_COLOR = {"Low": "🟒", "Medium": "🟑", "High": "πŸ”΄"}
354
  LEVEL_HEX = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
@@ -412,8 +418,6 @@ def _score_breakdown_html(breakdown) -> str:
412
  if not breakdown:
413
  return ""
414
  w = breakdown["weights"]
415
- floor_note = ('<div class="cx-bd-floor">βš“ Floor 0.30 applied β€” symbolic rules fired</div>'
416
- if breakdown.get("floor_applied") else "")
417
  return f"""
418
  <div class="cx-breakdown">
419
  <div class="cx-bd-row">
@@ -432,12 +436,11 @@ def _score_breakdown_html(breakdown) -> str:
432
  <span class="cx-bd-k">Final</span>
433
  <span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
434
  </div>
435
- {floor_note}
436
  </div>"""
437
 
438
 
439
  # ═══════════════════════════════════════════════════════════════════════════════
440
- # Analysis flow
441
  # ═══════════════════════════════════════════════════════════════════════════════
442
  def _run_analysis(text: str):
443
  if not text or len(text.strip()) < 30:
@@ -505,7 +508,6 @@ def _build_outputs(text: str):
505
  </div>
506
  {model_note}"""
507
 
508
- # Top cards
509
  top_parts = ['<div class="cx-section-title">πŸ”₯ Top Risk Clauses</div>',
510
  '<div class="cx-top-grid">']
511
  for r in doc["top_risks"]:
@@ -537,7 +539,6 @@ def _build_outputs(text: str):
537
  top_parts.append("</div>")
538
  top_html = "\n".join(top_parts)
539
 
540
- # Markdown breakdown table
541
  rows = [
542
  "## πŸ“„ All Clauses\n",
543
  "| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
@@ -565,14 +566,10 @@ def _build_outputs(text: str):
565
  f"{r['clause_text'][:55].replace(chr(10), ' ')}…"
566
  for r in doc["clauses"]
567
  ]
568
- # PDF download button: visible only after a successful analysis
569
  pdf_update = gr.update(visible=True, value=None)
570
  return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
571
 
572
 
573
- # ────────────────��────────────────────────────────────────────────────────────
574
- # Clause explanation panel β€” lazy-runs Gemini + LIME + attention here
575
- # ─────────────────────────────────────────────────────────────────────────────
576
  def show_clause_explanation(choice: str, doc_state: dict):
577
  if not choice or not doc_state:
578
  return '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
@@ -584,27 +581,22 @@ def show_clause_explanation(choice: str, doc_state: dict):
584
 
585
  explanation = generate_explanation(r["clause_text"], r)
586
 
587
- # ── Gemini summary (cached) ────────────────────────────────────────────
588
  if nl_summarizer is not None:
589
  nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
590
  explanation["natural_language_summary"] = nl_text
591
- # Persist so the PDF report can include it
592
  r["nl_summary"] = nl_text
593
  else:
594
  explanation["natural_language_summary"] = ""
595
 
596
- # ── LIME (lazy, bounded) ───────────────────────────────────────────────
597
  lime_words = []
598
  if local_explainer is not None and build_predict_fn_for_manager is not None:
599
  try:
600
  manager.ensure_loaded()
601
  predict_fn = build_predict_fn_for_manager(manager)
602
- lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn,
603
- num_features=10)
604
  except Exception as e:
605
  print(f"[WARN] LIME path failed: {e}")
606
 
607
- # ── Attention map (lazy, bounded) ──────────────────────────────────────
608
  attn_tokens = []
609
  if local_explainer is not None and manager.is_ready:
610
  try:
@@ -614,7 +606,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
614
  except Exception as e:
615
  print(f"[WARN] Attention path failed: {e}")
616
 
617
- # ── Render ─────────────────────────────────────────────────────────────
618
  lvl = r["risk_level_raw"]
619
  color = LEVEL_HEX.get(lvl, "#6b7280")
620
  cpct = int(r["risk_score"] * 100)
@@ -631,7 +622,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
631
  f'<div class="cx-pills">{evidence_pills}</div>'
632
  ) if evidence_pills else ""
633
 
634
- # NL summary block
635
  nl_block = ""
636
  nl_text = explanation.get("natural_language_summary", "").strip()
637
  if nl_text:
@@ -642,22 +632,19 @@ def show_clause_explanation(choice: str, doc_state: dict):
642
  f'<div class="cx-nl">{nl_text}</div>'
643
  )
644
 
645
- # LIME + attention blocks
646
  lime_block = ""
647
  if lime_words:
648
- lime_block = (f'<div class="cx-section-label">πŸ§ͺ LIME β€” Token Contributions</div>'
649
  f'{lime_html(lime_words)}')
650
  attn_block = ""
651
  if attn_tokens:
652
  attn_block = (f'<div class="cx-section-label">πŸ‘οΈ Attention Heatmap</div>'
653
  f'{attention_heatmap_html(attn_tokens)}')
654
 
655
- # Score-breakdown plaintext (Riya's "Final Score = ..." string)
656
  bd_text_block = ""
657
  if explanation.get("score_breakdown_text"):
658
  bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
659
 
660
- # Per-rule cards
661
  rules_html = ""
662
  for rule_data in explanation.get("rules") or []:
663
  rid = rule_data["rule_id"]
@@ -725,9 +712,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
725
  </div>"""
726
 
727
 
728
- # ─────────────────────────────────────────────────────────────────────────────
729
- # PDF report download
730
- # ─────────────────────────────────────────────────────────────────────────────
731
  def build_pdf_report(doc_state: dict):
732
  if not doc_state:
733
  return gr.update(visible=False, value=None)
@@ -744,7 +728,6 @@ def build_pdf_report(doc_state: dict):
744
  return gr.update(visible=True, value=None)
745
 
746
 
747
- # ── Examples ──────────────────────────────────────────────────────────────────
748
  EXAMPLES = [
749
  ("⚑ High Risk", """1. Liability Cap
750
  The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
@@ -752,10 +735,7 @@ including gross negligence or wilful misconduct of either party.
752
  2. Non-Compete
753
  Employee shall not compete with the company in any capacity for 3 years following
754
  termination of this agreement, within the territory of India.
755
- 3. Data Processing
756
- The vendor shall collect and process customer personal data as required to fulfil
757
- the services described in Schedule A of this agreement.
758
- 4. Indemnity
759
  The Service Provider shall indemnify and hold harmless the Client against any and all
760
  claims, damages, losses, and expenses arising out of or related to this agreement."""),
761
 
@@ -766,40 +746,35 @@ Any dispute arising out of this agreement shall be referred to arbitration with
766
  the seat of arbitration in Singapore.
767
  3. Pricing
768
  The Company may modify the prices and fees charged under this agreement at
769
- its sole discretion upon written notice to the Customer."""),
770
 
771
  ("🟒 Low Risk", """1. Renewal
772
  This agreement renews automatically every year unless either party provides
773
  30 days written notice before the renewal date.
774
  2. Governing Law
775
  This agreement is governed by the laws of India."""),
 
 
 
 
 
776
  ]
777
 
778
 
779
- # ── CSS (additive over v5.1) ──────────────────────────────────────────────────
780
  CUSTOM_CSS = """
781
  @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
782
  * { box-sizing: border-box; }
783
  body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
784
  footer { display:none !important; }
785
  .gradio-container { max-width:1080px !important; margin:0 auto !important; }
786
- .cx-hero { text-align:center; padding:52px 24px 36px;
787
- background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%);
788
- border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
789
- .cx-hero::before { content:''; position:absolute; inset:0;
790
- background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%);
791
- pointer-events:none; }
792
  .cx-hero-icon { font-size:44px; margin-bottom:14px; }
793
- .cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em;
794
- background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%);
795
- -webkit-background-clip:text; -webkit-text-fill-color:transparent;
796
- margin:0 0 10px; line-height:1.1; }
797
  .cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
798
  .cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
799
- .cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase;
800
- padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
801
- .cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px;
802
- padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
803
  .cx-model-notice strong { color:#a5b4fc; }
804
  .cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
805
  .cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
@@ -836,7 +811,6 @@ footer { display:none !important; }
836
  .cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
837
  .cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
838
  .cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
839
- .cx-bd-floor { margin-top:8px; font-size:11px; color:#fbbf24; background:#422006; padding:6px 10px; border-radius:6px; font-family:'DM Sans',sans-serif; }
840
  .cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
841
  .cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
842
  .cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
@@ -880,7 +854,6 @@ select, .gr-dropdown { background:#0c1525 !important; border-color:#1e293b !impo
880
  """
881
 
882
 
883
- # ── Build UI ───────────────────────────────────────────────────────────────────
884
  def build_ui():
885
  with gr.Blocks(
886
  title="ClauseXplain β€” AI Legal Risk Dashboard",
@@ -898,7 +871,7 @@ def build_ui():
898
  <div class="cx-hero">
899
  <div class="cx-hero-icon">βš–οΈ</div>
900
  <h1 class="cx-hero-title">ClauseXplain</h1>
901
- <p class="cx-hero-sub">AI Legal Risk Analyzer for Indian Contracts</p>
902
  <div class="cx-badges">
903
  <span class="cx-badge-hero">ICA 1872</span>
904
  <span class="cx-badge-hero">DPDPA 2023</span>
@@ -913,7 +886,7 @@ def build_ui():
913
  gr.HTML("""
914
  <div class="cx-model-notice">
915
  ⏳ &nbsp;The neural model (~2 GB) loads on your <strong>first analysis request</strong> β€”
916
- expect 60–90 s. LIME + attention run lazily when you open a clause (~15–25 s).
917
  </div>
918
  """)
919
 
@@ -944,7 +917,6 @@ def build_ui():
944
  with gr.Accordion("πŸ“„ Full Clause Breakdown", open=False):
945
  breakdown_out = gr.Markdown("")
946
 
947
- # ── PDF download ──────────────────────────────────────────────────────
948
  with gr.Row():
949
  pdf_dl_btn = gr.Button("πŸ“₯ Download PDF Report", variant="primary")
950
  pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)
 
1
  # app.py
2
+ # ClauseXplain v5.3 — hardening pass
3
+ # Changes vs v5.2:
4
+ # • ICA_007 (uncapped indemnity) now requires has_uncapped_signal — no more
5
+ # auto-firing on every "indemnify" mention
6
+ # • analyze_document uses level_from_score() from inference.py (single source
7
+ # of truth for the new 0.50 / 0.80 risk-level cutoffs)
8
 
9
  from __future__ import annotations
10
  import os
 
21
  from sklearn.preprocessing import MultiLabelBinarizer
22
  from huggingface_hub import hf_hub_download
23
 
 
24
  from feature_extractor import ClauseFeatureExtractor
25
  from explanation import generate_explanation
26
+ from inference import level_from_score # v5.3: single source of truth
27
+ from utils import highlight_keywords
28
 
29
  # ── Optional / fail-soft integrations ─────────────────────────────────────────
30
  try:
 
55
  print(f"[WARN] report disabled: {_e}")
56
  generate_report = None
57
 
 
58
  DEVICE = torch.device("cpu")
59
 
 
60
  CLAUSE_CLASSES = [
61
  "Cap On Liability", "Change Of Control", "Covenant Not To Sue",
62
  "Exclusivity", "Governing Law", "IP Ownership Assignment",
 
77
  RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
78
 
79
  # ─────────────────────────────────────────────────────────────────────────────
80
+ # Symbolic rules — v5.3 tightened
81
  # ─────────────────────────────────────────────────────────────────────────────
82
  SYMBOLIC_RULES = [
83
  {"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
 
92
  {"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
93
  "reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
94
  "condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
95
+ # ICA_005: only fires on explicit gambling vocab — no more "contingent on closing"
96
+ {"rule_id": "ICA_005", "name": "Wagering / Gambling Agreement",
97
  "reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
98
  "condition": lambda f: f.get("is_wagering_clause")},
99
  {"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
100
  "reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
101
  "condition": lambda f: f.get("restrains_legal_proceedings")},
102
+ # ICA_007 TIGHTENED: indemnity + explicit uncapped signal + no cap
103
  {"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
104
  "reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
105
+ "condition": lambda f: (
106
+ f.get("has_indemnity_clause")
107
+ and f.get("has_uncapped_signal")
108
+ and not f.get("indemnity_capped")
109
+ )},
110
  {"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
111
  "reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
112
  "condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
113
  {"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
114
  "reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
115
  "condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
116
+ # ICA_010 narrowed via tightened has_exclusivity patterns in feature_extractor
117
  {"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
118
  "reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
119
  "condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
 
142
 
143
 
144
  # ─────────────────────────────────────────────────────────────────────────────
145
+ # Model (unchanged)
146
  # ─────────────────────────────────────────────────────────────────────────────
147
  class ClauseXplainV5(nn.Module):
148
  def __init__(self, num_clause_labels: int, num_risk_labels: int):
 
219
  repo_id="riyasuryawanshi746/clauseXplain",
220
  filename="clausexplain_v5_best.pt",
221
  )
 
222
  checkpoint = torch.load(
223
  ckpt_path,
224
  map_location=torch.device("cpu"),
 
308
  "top_risk_cats": top_risks,
309
  "triggered_rules": triggered_clean,
310
  "features": {k: v for k, v in features.items() if v},
311
+ "evidence": evidence,
312
  "score_breakdown": fusion["breakdown"],
313
  "confidence": confidence,
314
  }
 
340
 
341
  scores = [r["risk_score"] for r in results]
342
  overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
343
+ # v5.3: single source of truth for thresholds
344
+ level, _ = level_from_score(overall)
 
345
  return {
346
  "overall_risk": overall,
347
  "overall_level": level,
 
354
  manager = ModelManager()
355
 
356
  # ═══════════════════════════════════════════════════════════════════════════════
357
+ # UI helpers (unchanged from v5.2)
358
  # ═══════════════════════════════════════════════════════════════════════════════
359
  LEVEL_COLOR = {"Low": "🟒", "Medium": "🟑", "High": "πŸ”΄"}
360
  LEVEL_HEX = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
 
418
  if not breakdown:
419
  return ""
420
  w = breakdown["weights"]
 
 
421
  return f"""
422
  <div class="cx-breakdown">
423
  <div class="cx-bd-row">
 
436
  <span class="cx-bd-k">Final</span>
437
  <span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
438
  </div>
 
439
  </div>"""
440
 
441
 
442
  # ═══════════════════════════════════════════════════════════════════════════════
443
+ # Analysis flow (unchanged structurally)
444
  # ═══════════════════════════════════════════════════════════════════════════════
445
  def _run_analysis(text: str):
446
  if not text or len(text.strip()) < 30:
 
508
  </div>
509
  {model_note}"""
510
 
 
511
  top_parts = ['<div class="cx-section-title">πŸ”₯ Top Risk Clauses</div>',
512
  '<div class="cx-top-grid">']
513
  for r in doc["top_risks"]:
 
539
  top_parts.append("</div>")
540
  top_html = "\n".join(top_parts)
541
 
 
542
  rows = [
543
  "## πŸ“„ All Clauses\n",
544
  "| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
 
566
  f"{r['clause_text'][:55].replace(chr(10), ' ')}…"
567
  for r in doc["clauses"]
568
  ]
 
569
  pdf_update = gr.update(visible=True, value=None)
570
  return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
571
 
572
 
 
 
 
573
  def show_clause_explanation(choice: str, doc_state: dict):
574
  if not choice or not doc_state:
575
  return '<div class="cx-empty">← Select a clause above to see its full legal analysis.</div>'
 
581
 
582
  explanation = generate_explanation(r["clause_text"], r)
583
 
 
584
  if nl_summarizer is not None:
585
  nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
586
  explanation["natural_language_summary"] = nl_text
 
587
  r["nl_summary"] = nl_text
588
  else:
589
  explanation["natural_language_summary"] = ""
590
 
 
591
  lime_words = []
592
  if local_explainer is not None and build_predict_fn_for_manager is not None:
593
  try:
594
  manager.ensure_loaded()
595
  predict_fn = build_predict_fn_for_manager(manager)
596
+ lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn)
 
597
  except Exception as e:
598
  print(f"[WARN] LIME path failed: {e}")
599
 
 
600
  attn_tokens = []
601
  if local_explainer is not None and manager.is_ready:
602
  try:
 
606
  except Exception as e:
607
  print(f"[WARN] Attention path failed: {e}")
608
 
 
609
  lvl = r["risk_level_raw"]
610
  color = LEVEL_HEX.get(lvl, "#6b7280")
611
  cpct = int(r["risk_score"] * 100)
 
622
  f'<div class="cx-pills">{evidence_pills}</div>'
623
  ) if evidence_pills else ""
624
 
 
625
  nl_block = ""
626
  nl_text = explanation.get("natural_language_summary", "").strip()
627
  if nl_text:
 
632
  f'<div class="cx-nl">{nl_text}</div>'
633
  )
634
 
 
635
  lime_block = ""
636
  if lime_words:
637
+ lime_block = (f'<div class="cx-section-label">πŸ§ͺ LIME β€” Key Legal Terms Driving Risk</div>'
638
  f'{lime_html(lime_words)}')
639
  attn_block = ""
640
  if attn_tokens:
641
  attn_block = (f'<div class="cx-section-label">πŸ‘οΈ Attention Heatmap</div>'
642
  f'{attention_heatmap_html(attn_tokens)}')
643
 
 
644
  bd_text_block = ""
645
  if explanation.get("score_breakdown_text"):
646
  bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
647
 
 
648
  rules_html = ""
649
  for rule_data in explanation.get("rules") or []:
650
  rid = rule_data["rule_id"]
 
712
  </div>"""
713
 
714
 
 
 
 
715
  def build_pdf_report(doc_state: dict):
716
  if not doc_state:
717
  return gr.update(visible=False, value=None)
 
728
  return gr.update(visible=True, value=None)
729
 
730
 
 
731
  EXAMPLES = [
732
  ("⚑ High Risk", """1. Liability Cap
733
  The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
 
735
  2. Non-Compete
736
  Employee shall not compete with the company in any capacity for 3 years following
737
  termination of this agreement, within the territory of India.
738
+ 3. Indemnity
 
 
 
739
  The Service Provider shall indemnify and hold harmless the Client against any and all
740
  claims, damages, losses, and expenses arising out of or related to this agreement."""),
741
 
 
746
  the seat of arbitration in Singapore.
747
  3. Pricing
748
  The Company may modify the prices and fees charged under this agreement at
749
+ its sole discretion to modify the terms upon written notice."""),
750
 
751
  ("🟒 Low Risk", """1. Renewal
752
  This agreement renews automatically every year unless either party provides
753
  30 days written notice before the renewal date.
754
  2. Governing Law
755
  This agreement is governed by the laws of India."""),
756
+
757
+ ("πŸ§ͺ Benign (M&A-style)", """Compensation paid hereunder shall be exclusive of the Company's
758
+ contributions to statutory benefits. Payment of the closing bonus is
759
+ contingent on the occurrence of the closing of the merger transaction
760
+ and continued employment through such date."""),
761
  ]
762
 
763
 
 
764
  CUSTOM_CSS = """
765
  @import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
766
  * { box-sizing: border-box; }
767
  body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
768
  footer { display:none !important; }
769
  .gradio-container { max-width:1080px !important; margin:0 auto !important; }
770
+ .cx-hero { text-align:center; padding:52px 24px 36px; background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%); border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
771
+ .cx-hero::before { content:''; position:absolute; inset:0; background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%); pointer-events:none; }
 
 
 
 
772
  .cx-hero-icon { font-size:44px; margin-bottom:14px; }
773
+ .cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em; background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 10px; line-height:1.1; }
 
 
 
774
  .cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
775
  .cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
776
+ .cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase; padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
777
+ .cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px; padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
 
 
778
  .cx-model-notice strong { color:#a5b4fc; }
779
  .cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
780
  .cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
 
811
  .cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
812
  .cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
813
  .cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
 
814
  .cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
815
  .cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
816
  .cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
 
854
  """
855
 
856
 
 
857
  def build_ui():
858
  with gr.Blocks(
859
  title="ClauseXplain β€” AI Legal Risk Dashboard",
 
871
  <div class="cx-hero">
872
  <div class="cx-hero-icon">βš–οΈ</div>
873
  <h1 class="cx-hero-title">ClauseXplain</h1>
874
+ <p class="cx-hero-sub">International contract neural backbone, localised via Indian neuro-symbolic legal reasoning</p>
875
  <div class="cx-badges">
876
  <span class="cx-badge-hero">ICA 1872</span>
877
  <span class="cx-badge-hero">DPDPA 2023</span>
 
886
  gr.HTML("""
887
  <div class="cx-model-notice">
888
  ⏳ &nbsp;The neural model (~2 GB) loads on your <strong>first analysis request</strong> β€”
889
+ expect 60–90 s. Per-clause LIME + attention run lazily when you inspect a clause (~15–25 s).
890
  </div>
891
  """)
892
 
 
917
  with gr.Accordion("πŸ“„ Full Clause Breakdown", open=False):
918
  breakdown_out = gr.Markdown("")
919
 
 
920
  with gr.Row():
921
  pdf_dl_btn = gr.Button("πŸ“₯ Download PDF Report", variant="primary")
922
  pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)
explanation.py CHANGED
@@ -1,15 +1,10 @@
1
  # explanation.py
2
- # Template-based explanation engine β€” adds risk_breakdown, evidence list,
3
- # confidence_level, natural_language_summary placeholder, and a formatted
4
- # score_breakdown_text.
5
 
6
  from __future__ import annotations
7
  from inference import RULE_FEATURE_DEPS
8
 
9
 
10
- # ─────────────────────────────────────────────────────────────────────────────
11
- # Per-rule explanations (kept identical to v5.1)
12
- # ─────────────────────────────────────────────────────────────────────────────
13
  RULE_EXPLANATIONS = {
14
  "ICA_001": {
15
  "why": "Liability is capped even for gross negligence or wilful misconduct.",
@@ -32,9 +27,9 @@ RULE_EXPLANATIONS = {
32
  "suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
33
  },
34
  "ICA_005": {
35
- "why": "The clause is contingent on an uncertain event in a way that resembles a wager.",
36
  "meaning": "Such agreements are void under Indian Contract Act S.30.",
37
- "suggestion": "Remove or restructure the contingency to avoid wagering characteristics.",
38
  },
39
  "ICA_006": {
40
  "why": "The clause restricts a party from pursuing legal proceedings.",
@@ -42,8 +37,8 @@ RULE_EXPLANATIONS = {
42
  "suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
43
  },
44
  "ICA_007": {
45
- "why": "An indemnity obligation is created without a cap on its monetary exposure.",
46
- "meaning": "You could face unlimited downstream financial liability for third-party claims.",
47
  "suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
48
  },
49
  "ICA_008": {
@@ -57,7 +52,7 @@ RULE_EXPLANATIONS = {
57
  "suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
58
  },
59
  "ICA_010": {
60
- "why": "Exclusivity is imposed without a defined term, making it open-ended.",
61
  "meaning": "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
62
  "suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
63
  },
@@ -98,9 +93,6 @@ RULE_EXPLANATIONS = {
98
  },
99
  }
100
 
101
- # ─────────────────────────────────────────────────────────────────────────────
102
- # Risk-level prose
103
- # ─────────────────────────────────────────────────────────────────────────────
104
  RISK_CONTEXT = {
105
  "Low": "This clause appears relatively standard with minimal legal exposure.",
106
  "Medium": "This clause contains terms that warrant careful review before signing.",
@@ -117,9 +109,6 @@ CATEGORY_CONTEXT = {
117
  }
118
 
119
 
120
- # ─────────────────────────────────────────────────────────────────────────────
121
- # Helpers
122
- # ─────────────────────────────────────────────────────────────────────────────
123
  def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
124
  deps = RULE_FEATURE_DEPS.get(rule_id, [])
125
  snippets: list[dict] = []
@@ -134,7 +123,6 @@ def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
134
 
135
 
136
  def _flat_evidence(evidence: dict) -> list[dict]:
137
- """Riya's Prompt 1 evidence shape."""
138
  out = []
139
  for feat, hits in (evidence or {}).items():
140
  for h in hits:
@@ -154,36 +142,14 @@ def _format_score_breakdown_text(breakdown: dict | None, fused: float) -> str:
154
  nrm = breakdown.get("neural_score", 0.0)
155
  sym = breakdown.get("symbolic_score", 0.0)
156
  fin = breakdown.get("final", fused)
157
- note = " [floor 0.30 applied]" if breakdown.get("floor_applied") else ""
158
  return (
159
  f"Final Score = {fin:.2f} "
160
- f"(Neural {nrm:.2f} x {w.get('neural', 0):.2f} + "
161
- f"Symbolic {sym:.2f} x {w.get('symbolic', 0):.2f}){note}"
162
  )
163
 
164
 
165
- # ─────────────────────────────────────────────────────────────────────────────
166
- # Main entry point
167
- # ─────────────────────────────────────────────────────────────────────────────
168
  def generate_explanation(text: str, result: dict) -> dict:
169
- """
170
- Returns:
171
- {
172
- # ── Original keys (backward compatible) ──
173
- "overview": str,
174
- "rules": list[dict], # with per-rule evidence
175
- "general_tip": str,
176
- "score_breakdown": dict | None,
177
- "confidence": dict | None,
178
-
179
- # ── New keys (Prompt 1) ──
180
- "risk_breakdown": dict # neural/symbolic/weights/final
181
- "evidence": list[dict], # flat list across all features
182
- "confidence_level": str # "Low"|"Medium"|"High"
183
- "natural_language_summary": str # filled by NLSummarizer (placeholder here)
184
- "score_breakdown_text": str # human-readable formula string
185
- }
186
- """
187
  level_raw = result.get("risk_level_raw", "Low")
188
  triggered = result.get("triggered_rules", [])
189
  top_cats = result.get("top_risk_cats", [])
@@ -192,7 +158,6 @@ def generate_explanation(text: str, result: dict) -> dict:
192
  breakdown = result.get("score_breakdown")
193
  confidence = result.get("confidence") or {}
194
 
195
- # ── Overview sentence ──────────────────────────────────────────────────
196
  primary_cat = top_cats[0][0] if top_cats else "structural"
197
  cat_desc = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
198
  overview = (
@@ -200,7 +165,6 @@ def generate_explanation(text: str, result: dict) -> dict:
200
  f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
201
  )
202
 
203
- # ── Per-rule explanations ──────────────────────────────────────────────
204
  rule_details = []
205
  for rule in triggered:
206
  rid = rule.get("rule_id", "")
@@ -217,7 +181,6 @@ def generate_explanation(text: str, result: dict) -> dict:
217
  "evidence": _evidence_for_rule(rid, evidence),
218
  })
219
 
220
- # ── General tip ────────────────────────────────────────────────────────
221
  general_tip = ""
222
  if not triggered:
223
  if level_raw == "Low":
@@ -229,31 +192,28 @@ def generate_explanation(text: str, result: dict) -> dict:
229
  general_tip = ("High neural risk score despite no specific rule triggers. "
230
  "The clause may contain broad or one-sided language - seek legal review.")
231
 
232
- # ── Risk breakdown dict (Prompt 1) ─────────────────────────────────────
233
  risk_breakdown = breakdown or {
234
  "neural_score": result.get("neural_score", 0.0),
235
  "symbolic_score": result.get("symbolic_score", 0.0),
236
- "weights": {"neural": 0.60, "symbolic": 0.40},
237
  "raw_fused": risk_score,
238
  "floor_applied": False,
239
  "final": risk_score,
240
- "formula": f"({0.60} Γ— {result.get('neural_score', 0):.3f}) + "
241
- f"({0.40} Γ— {result.get('symbolic_score', 0):.3f}) "
242
  f"= {risk_score:.3f}",
243
  }
244
 
245
  return {
246
- # Original / backward-compatible
247
  "overview": overview,
248
  "rules": rule_details,
249
  "general_tip": general_tip,
250
  "score_breakdown": breakdown,
251
  "confidence": confidence,
252
-
253
- # Prompt 1 additions
254
  "risk_breakdown": risk_breakdown,
255
  "evidence": _flat_evidence(evidence),
256
  "confidence_level": confidence.get("level", "Medium"),
257
- "natural_language_summary": "", # filled later by NLSummarizer
258
  "score_breakdown_text": _format_score_breakdown_text(breakdown, risk_score),
259
  }
 
1
  # explanation.py
2
+ # v5.3 — fallback risk_breakdown weights aligned to new fusion config.
 
 
3
 
4
  from __future__ import annotations
5
  from inference import RULE_FEATURE_DEPS
6
 
7
 
 
 
 
8
  RULE_EXPLANATIONS = {
9
  "ICA_001": {
10
  "why": "Liability is capped even for gross negligence or wilful misconduct.",
 
27
  "suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
28
  },
29
  "ICA_005": {
30
+ "why": "The clause uses gambling, wagering, or betting vocabulary.",
31
  "meaning": "Such agreements are void under Indian Contract Act S.30.",
32
+ "suggestion": "Remove or restructure the wagering element of this clause.",
33
  },
34
  "ICA_006": {
35
  "why": "The clause restricts a party from pursuing legal proceedings.",
 
37
  "suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
38
  },
39
  "ICA_007": {
40
+ "why": "An indemnity obligation is paired with uncapped / unlimited liability language.",
41
+ "meaning": "You could face open-ended financial exposure for third-party claims.",
42
  "suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
43
  },
44
  "ICA_008": {
 
52
  "suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
53
  },
54
  "ICA_010": {
55
+ "why": "Exclusivity rights are granted without a defined term, making them open-ended.",
56
  "meaning": "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
57
  "suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
58
  },
 
93
  },
94
  }
95
 
 
 
 
96
  RISK_CONTEXT = {
97
  "Low": "This clause appears relatively standard with minimal legal exposure.",
98
  "Medium": "This clause contains terms that warrant careful review before signing.",
 
109
  }
110
 
111
 
 
 
 
112
  def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
113
  deps = RULE_FEATURE_DEPS.get(rule_id, [])
114
  snippets: list[dict] = []
 
123
 
124
 
125
  def _flat_evidence(evidence: dict) -> list[dict]:
 
126
  out = []
127
  for feat, hits in (evidence or {}).items():
128
  for h in hits:
 
142
  nrm = breakdown.get("neural_score", 0.0)
143
  sym = breakdown.get("symbolic_score", 0.0)
144
  fin = breakdown.get("final", fused)
 
145
  return (
146
  f"Final Score = {fin:.2f} "
147
+ f"(Neural {nrm:.2f} Γ— {w.get('neural', 0):.2f} + "
148
+ f"Symbolic {sym:.2f} Γ— {w.get('symbolic', 0):.2f})"
149
  )
150
 
151
 
 
 
 
152
  def generate_explanation(text: str, result: dict) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  level_raw = result.get("risk_level_raw", "Low")
154
  triggered = result.get("triggered_rules", [])
155
  top_cats = result.get("top_risk_cats", [])
 
158
  breakdown = result.get("score_breakdown")
159
  confidence = result.get("confidence") or {}
160
 
 
161
  primary_cat = top_cats[0][0] if top_cats else "structural"
162
  cat_desc = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
163
  overview = (
 
165
  f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
166
  )
167
 
 
168
  rule_details = []
169
  for rule in triggered:
170
  rid = rule.get("rule_id", "")
 
181
  "evidence": _evidence_for_rule(rid, evidence),
182
  })
183
 
 
184
  general_tip = ""
185
  if not triggered:
186
  if level_raw == "Low":
 
192
  general_tip = ("High neural risk score despite no specific rule triggers. "
193
  "The clause may contain broad or one-sided language - seek legal review.")
194
 
195
+ # v5.3: fallback weights updated to new neural-dominant config
196
  risk_breakdown = breakdown or {
197
  "neural_score": result.get("neural_score", 0.0),
198
  "symbolic_score": result.get("symbolic_score", 0.0),
199
+ "weights": {"neural": 0.75, "symbolic": 0.25},
200
  "raw_fused": risk_score,
201
  "floor_applied": False,
202
  "final": risk_score,
203
+ "formula": f"(0.75 Γ— {result.get('neural_score', 0):.3f}) + "
204
+ f"(0.25 Γ— {result.get('symbolic_score', 0):.3f}) "
205
  f"= {risk_score:.3f}",
206
  }
207
 
208
  return {
 
209
  "overview": overview,
210
  "rules": rule_details,
211
  "general_tip": general_tip,
212
  "score_breakdown": breakdown,
213
  "confidence": confidence,
 
 
214
  "risk_breakdown": risk_breakdown,
215
  "evidence": _flat_evidence(evidence),
216
  "confidence_level": confidence.get("level", "Medium"),
217
+ "natural_language_summary": "",
218
  "score_breakdown_text": _format_score_breakdown_text(breakdown, risk_score),
219
  }
feature_extractor.py CHANGED
@@ -1,186 +1,206 @@
1
  # feature_extractor.py
2
- # Synonym-aware regex feature extractor with rich evidence trace.
3
- # Moved out of app.py to be reusable by LIME wrappers, report generation,
4
- # and downstream explainability tooling.
 
 
5
 
6
  from __future__ import annotations
7
  import re
8
 
9
 
10
  class ClauseFeatureExtractor:
11
- """
12
- Regex-based hybrid extractor.
13
-
14
- Public API:
15
- • extract(text) -> (features, evidence_dict)
16
- features: dict[str, bool|int]
17
- evidence_dict: dict[feature_name, list[hit]]
18
- hit = {"phrase": str, "span": [start, end], "label": str}
19
-
20
- • extract_unified(text) -> dict
21
- Returns the format requested in Riya's Prompt 3:
22
- {
23
- "<feature_name>": True/False/int,
24
- ...,
25
- "evidence": [
26
- {"feature": str, "keywords": [str, ...],
27
- "evidence_text": str, "span": [start, end]},
28
- ...
29
- ],
30
- }
31
-
32
- • flatten_evidence(evidence_dict) -> list[dict]
33
- Convert the nested evidence dict to a flat list of hits
34
- (one hit per matched phrase).
35
- """
36
-
37
  BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
 
38
  "has_liability_cap": [
39
- (re.compile(r"\bshall\s+not\s+exceed\b", re.I), "shall not exceed"),
40
  (re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
41
- (re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I), "cap on liability"),
42
  (re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|β‚Ή)", re.I), "limited to (amount)"),
43
- (re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I), "liability limited"),
44
  ],
45
  "excludes_gross_negligence": [
46
- (re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I), "gross negligence"),
47
- (re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I), "wilful misconduct"),
48
- (re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I), "intentional misconduct"),
49
- (re.compile(r"\brecklessness?\b", re.I), "recklessness"),
50
- (re.compile(r"\bbad\s+faith\b", re.I), "bad faith"),
51
- (re.compile(r"\bfraud(?:ulent)?\b", re.I), "fraud"),
52
  ],
 
 
53
  "has_liquidated_damages": [
54
- (re.compile(r"\bliquidated\s+damages?\b", re.I), "liquidated damages"),
55
  (re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
56
- (re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I), "penalty amount"),
57
  ],
58
  "damages_exceed_loss": [
59
  (re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
60
- (re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I), "irrespective of loss"),
61
- (re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I), "without proof of loss"),
62
- (re.compile(r"\bpenalty\s+clause\b", re.I), "penalty clause"),
63
  ],
 
 
64
  "is_wagering_clause": [
65
- (re.compile(r"\bcontingent\s+on\s+(?:the\s+)?(?:outcome|happening|occurrence)\b", re.I), "contingent on outcome"),
66
- (re.compile(r"\b(?:wager|bet|bets|betting|gamble|gambling|speculative)\b", re.I), "wager/bet/speculative"),
67
- (re.compile(r"\bdepends?\s+(?:entirely\s+)?on\s+(?:an\s+)?uncertain\s+event\b", re.I), "uncertain event"),
 
 
 
 
 
68
  ],
 
69
  "restrains_legal_proceedings": [
70
  (re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
71
  (re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
72
- (re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\b", re.I), "relinquish right"),
73
- (re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I), "no legal proceedings"),
74
  ],
 
 
75
  "unilateral_termination": [
76
- (re.compile(r"\bmay\s+terminate\b", re.I), "may terminate"),
77
- (re.compile(r"\bsole\s+(?:discretion|option)\b", re.I), "sole discretion"),
78
  (re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
79
- (re.compile(r"\bat\s+(?:its|the)\s+(?:sole\s+)?discretion\b", re.I), "at its discretion"),
80
  ],
81
  "notice_period_defined": [
82
  (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
83
- (re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I), "notice period of N"),
84
  ],
 
 
85
  "processes_personal_data": [
86
- (re.compile(r"\bpersonal\s+(?:data|information)\b", re.I), "personal data"),
87
  (re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
88
- (re.compile(r"\bdata\s+(?:subject|principal)\b", re.I), "data subject/principal"),
89
- (re.compile(r"\bpii\b", re.I), "PII"),
90
  ],
91
  "processes_sensitive_data": [
92
- (re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I), "sensitive personal data"),
93
  (re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
94
  ],
95
  "has_data_retention_clause": [
96
- (re.compile(r"\bretention\s+period\b", re.I), "retention period"),
97
  (re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
98
  (re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
99
- (re.compile(r"\bpurge|anonymise|delete\s+(?:after|upon)\b", re.I), "purge/delete after"),
100
  ],
101
  "has_breach_notification": [
102
- (re.compile(r"\bbreach\s+notification\b", re.I), "breach notification"),
103
- (re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I), "notify of breach"),
104
  (re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
105
- (re.compile(r"\bsecurity\s+incident\b", re.I), "security incident"),
106
  ],
107
  "has_consent_clause": [
108
  (re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
109
- (re.compile(r"\bopt[\-\s]?in\b", re.I), "opt-in"),
110
- (re.compile(r"\bexplicit\s+consent\b", re.I), "explicit consent"),
111
  ],
112
  "handles_digital_data": [
113
- (re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I), "digital/online/cloud"),
114
- (re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I), "server/db/SaaS"),
115
  ],
116
  "has_security_clause": [
117
- (re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I), "security measures"),
118
- (re.compile(r"\bencryption\b", re.I), "encryption"),
119
- (re.compile(r"\bcyber[\-\s]?security\b", re.I), "cybersecurity"),
120
- (re.compile(r"\baccess\s+controls?\b", re.I), "access controls"),
121
- (re.compile(r"\biso\s*27001\b", re.I), "ISO 27001"),
122
  ],
 
 
123
  "assigns_all_ip": [
124
- (re.compile(r"\ball\s+intellectual\s+property\b", re.I), "all intellectual property"),
125
- (re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I), "assigns all IP"),
126
  (re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
127
  ],
128
  "includes_pre_existing_ip": [
129
- (re.compile(r"\bpre[\-\s]?existing\b", re.I), "pre-existing"),
130
- (re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I), "background IP"),
131
- (re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I), "prior to engagement"),
132
  ],
 
 
133
  "is_consumer_contract": [
134
  (re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
135
  ],
136
  "has_one_sided_clause": [
137
- (re.compile(r"\bsole\s+discretion\b", re.I), "sole discretion"),
138
- (re.compile(r"\bwithout\s+(?:any\s+)?liability\b", re.I), "without liability"),
139
- (re.compile(r"\bno\s+obligation\s+(?:to|whatsoever)\b", re.I), "no obligation"),
140
- (re.compile(r"\babsolute\s+(?:right|discretion)\b", re.I), "absolute right"),
141
- (re.compile(r"\bunconditionally\b", re.I), "unconditionally"),
 
 
142
  ],
143
- # Indemnity
 
144
  "has_indemnity_clause": [
145
- (re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I), "indemnify/indemnification"),
146
- (re.compile(r"\bhold\s+harmless\b", re.I), "hold harmless"),
147
- (re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I), "defend & indemnify"),
148
  ],
149
  "indemnity_capped": [
150
  (re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
151
  (re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
152
  ],
153
- # Auto-renewal
 
 
 
 
 
 
 
 
 
 
154
  "has_auto_renewal": [
155
- (re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I), "auto-renew"),
156
- (re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I), "renews automatically"),
157
- (re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I), "evergreen clause"),
158
- (re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I), "continue to renew"),
159
  ],
160
  "has_opt_out_window": [
161
  (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
162
- (re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I), "non-renewal notice"),
163
- (re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I), "opt-out renewal"),
 
 
164
  ],
165
- # Arbitration
 
166
  "has_arbitration": [
167
- (re.compile(r"\barbitrat(?:ion|or|al)\b", re.I), "arbitration"),
168
- (re.compile(r"\barbitral\s+tribunal\b", re.I), "arbitral tribunal"),
169
  ],
170
  "arbitration_distant_venue": [
171
- (re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
172
- (re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
173
  ],
174
- # Exclusivity
 
175
  "has_exclusivity": [
176
- (re.compile(r"\bexclusiv(?:e|ity|ely)\b", re.I), "exclusive/exclusivity"),
177
- (re.compile(r"\bsole\s+(?:and\s+exclusive\s+)?(?:supplier|provider|vendor|distributor|licensee)\b", re.I), "sole & exclusive"),
 
 
 
 
 
 
 
178
  ],
179
  "exclusivity_term_defined": [
180
- (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+period\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+\b", re.I), "exclusivity term"),
181
- (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I), "exclusivity duration"),
 
182
  ],
183
- # Unilateral pricing
 
184
  "unilateral_price_change": [
185
  (re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
186
  (re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
@@ -197,7 +217,6 @@ class ClauseFeatureExtractor:
197
  "has_exclusivity": ["exclusivity_term_defined"],
198
  }
199
 
200
- # ── Core extraction ────────────────────────────────────────────────────
201
  def extract(self, text: str) -> tuple[dict, dict]:
202
  features: dict = {}
203
  evidence: dict = {}
@@ -229,6 +248,7 @@ class ClauseFeatureExtractor:
229
  if child not in features:
230
  features[child] = False
231
 
 
232
  m = re.search(
233
  r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
234
  r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
@@ -243,15 +263,13 @@ class ClauseFeatureExtractor:
243
  "span": [start, end],
244
  "label": f"{yrs}-year non-compete",
245
  }]
246
- elif any(re.search(p, text, re.I) for p in (
247
- r"\bnon[\-\s]?compete\b", r"\bshall\s+not\s+compete\b")):
248
  features["non_compete_years"] = 1
249
 
250
  return features, evidence
251
 
252
- # ── Convenience accessors ──────────────────────────────────────────────
253
  def extract_unified(self, text: str) -> dict:
254
- """Riya's Prompt 3 format: features merged with a flat evidence list."""
255
  features, evidence_dict = self.extract(text)
256
  out = dict(features)
257
  out["evidence"] = self.flatten_evidence(evidence_dict)
 
1
  # feature_extractor.py
2
+ # v5.3 — precision-tightened regex pack.
3
+ # Critical changes:
4
+ # • is_wagering_clause: strict gambling vocab only (no "contingent on …")
5
+ # • has_exclusivity: contextual phrases only (no bare "exclusive" / "exclusive of")
6
+ # • has_uncapped_signal: NEW — gates ICA_007 to require explicit uncapped language
7
 
8
  from __future__ import annotations
9
  import re
10
 
11
 
12
  class ClauseFeatureExtractor:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
14
+ # ── Liability cap (unchanged) ──────────────────────────────────────
15
  "has_liability_cap": [
16
+ (re.compile(r"\bshall\s+not\s+exceed\b", re.I), "shall not exceed"),
17
  (re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
18
+ (re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I), "cap on liability"),
19
  (re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|β‚Ή)", re.I), "limited to (amount)"),
20
+ (re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I), "liability limited"),
21
  ],
22
  "excludes_gross_negligence": [
23
+ (re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I), "gross negligence"),
24
+ (re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I), "wilful misconduct"),
25
+ (re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I), "intentional misconduct"),
26
+ (re.compile(r"\brecklessness?\b", re.I), "recklessness"),
27
+ (re.compile(r"\bbad\s+faith\b", re.I), "bad faith"),
28
+ (re.compile(r"\bfraud(?:ulent)?\b", re.I), "fraud"),
29
  ],
30
+
31
+ # ── Liquidated damages (unchanged) ─────────────────────────────────
32
  "has_liquidated_damages": [
33
+ (re.compile(r"\bliquidated\s+damages?\b", re.I), "liquidated damages"),
34
  (re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
35
+ (re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I), "penalty amount"),
36
  ],
37
  "damages_exceed_loss": [
38
  (re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
39
+ (re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I), "irrespective of loss"),
40
+ (re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I), "without proof of loss"),
41
+ (re.compile(r"\bpenalty\s+clause\b", re.I), "penalty clause"),
42
  ],
43
+
44
+ # ── WAGERING — TIGHTENED (no "contingent on"; explicit gambling vocab only)
45
  "is_wagering_clause": [
46
+ (re.compile(r"\b(?:wager|wagers|wagering|wagered)\b", re.I), "wager"),
47
+ (re.compile(r"\bgambling\b", re.I), "gambling"),
48
+ (re.compile(r"\blotter(?:y|ies)\b", re.I), "lottery"),
49
+ (re.compile(r"\bbetting\b", re.I), "betting"),
50
+ # "bet/bets on X" or "bet against X" — require directional preposition
51
+ (re.compile(r"\bbets?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "bet on/against"),
52
+ # "stake on X" or "stake against X" — same treatment
53
+ (re.compile(r"\bstakes?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "stake on/against"),
54
  ],
55
+
56
  "restrains_legal_proceedings": [
57
  (re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
58
  (re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
59
+ (re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\s+to\s+(?:sue|claim|recover)\b", re.I), "relinquish right (to sue)"),
60
+ (re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I), "no legal proceedings"),
61
  ],
62
+
63
+ # ── Termination (unchanged) ────────────────────────────────────────
64
  "unilateral_termination": [
65
+ (re.compile(r"\bmay\s+terminate\b", re.I), "may terminate"),
 
66
  (re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
67
+ (re.compile(r"\bat\s+(?:its|the)\s+sole\s+discretion\b[^.]{0,40}?\bterminat", re.I), "terminate at sole discretion"),
68
  ],
69
  "notice_period_defined": [
70
  (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
71
+ (re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I), "notice period of N"),
72
  ],
73
+
74
+ # ── DPDPA / IT Act / personal-data signals (unchanged) ─────────────
75
  "processes_personal_data": [
76
+ (re.compile(r"\bpersonal\s+(?:data|information)\b", re.I), "personal data"),
77
  (re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
78
+ (re.compile(r"\bdata\s+(?:subject|principal)\b", re.I), "data subject/principal"),
79
+ (re.compile(r"\bpii\b", re.I), "PII"),
80
  ],
81
  "processes_sensitive_data": [
82
+ (re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I), "sensitive personal data"),
83
  (re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
84
  ],
85
  "has_data_retention_clause": [
86
+ (re.compile(r"\bretention\s+period\b", re.I), "retention period"),
87
  (re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
88
  (re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
89
+ (re.compile(r"\b(?:purge|anonymise|delete)\s+(?:after|upon)\b", re.I), "purge/delete after"),
90
  ],
91
  "has_breach_notification": [
92
+ (re.compile(r"\bbreach\s+notification\b", re.I), "breach notification"),
93
+ (re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I), "notify of breach"),
94
  (re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
95
+ (re.compile(r"\bsecurity\s+incident\b", re.I), "security incident"),
96
  ],
97
  "has_consent_clause": [
98
  (re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
99
+ (re.compile(r"\bopt[\-\s]?in\b", re.I), "opt-in"),
100
+ (re.compile(r"\bexplicit\s+consent\b", re.I), "explicit consent"),
101
  ],
102
  "handles_digital_data": [
103
+ (re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I), "digital/online/cloud"),
104
+ (re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I), "server/db/SaaS"),
105
  ],
106
  "has_security_clause": [
107
+ (re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I), "security measures"),
108
+ (re.compile(r"\bencryption\b", re.I), "encryption"),
109
+ (re.compile(r"\bcyber[\-\s]?security\b", re.I), "cybersecurity"),
110
+ (re.compile(r"\baccess\s+controls?\b", re.I), "access controls"),
111
+ (re.compile(r"\biso\s*27001\b", re.I), "ISO 27001"),
112
  ],
113
+
114
+ # ── IP (unchanged) ─────────────────────────────────────────────────
115
  "assigns_all_ip": [
116
+ (re.compile(r"\ball\s+intellectual\s+property\b", re.I), "all intellectual property"),
117
+ (re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I), "assigns all IP"),
118
  (re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
119
  ],
120
  "includes_pre_existing_ip": [
121
+ (re.compile(r"\bpre[\-\s]?existing\s+(?:ip|intellectual|materials?|works?|inventions?)\b", re.I), "pre-existing"),
122
+ (re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I), "background IP"),
123
+ (re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I), "prior to engagement"),
124
  ],
125
+
126
+ # ── Consumer (one-sided narrowed slightly) ─────────────────────────
127
  "is_consumer_contract": [
128
  (re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
129
  ],
130
  "has_one_sided_clause": [
131
+ # "sole discretion" must be paired with a unilateral action verb to
132
+ # avoid firing on operational discretion language.
133
+ (re.compile(r"\bsole\s+discretion\b[^.]{0,50}?\b(?:terminate|modify|change|amend|deny|reject|refuse)\b", re.I), "sole discretion to terminate/modify"),
134
+ (re.compile(r"\bwithout\s+(?:any\s+)?liability\b", re.I), "without liability"),
135
+ (re.compile(r"\bno\s+obligation\s+(?:to|whatsoever)\b", re.I), "no obligation"),
136
+ (re.compile(r"\babsolute\s+(?:right|discretion)\b", re.I), "absolute right"),
137
+ (re.compile(r"\bunconditionally\b", re.I), "unconditionally"),
138
  ],
139
+
140
+ # ── Indemnity ──────────────────────────────────────────────────────
141
  "has_indemnity_clause": [
142
+ (re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I), "indemnify/indemnification"),
143
+ (re.compile(r"\bhold\s+harmless\b", re.I), "hold harmless"),
144
+ (re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I), "defend & indemnify"),
145
  ],
146
  "indemnity_capped": [
147
  (re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
148
  (re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
149
  ],
150
+
151
+ # ── NEW: uncapped / unlimited liability signals (gates ICA_007) ───
152
+ "has_uncapped_signal": [
153
+ (re.compile(r"\bunlimited\s+(?:liabilit(?:y|ies)|exposure|damages?)\b", re.I), "unlimited liability"),
154
+ (re.compile(r"\bwithout\s+(?:any\s+)?(?:limit|limitation|cap|ceiling)\b", re.I), "without limit"),
155
+ (re.compile(r"\bno\s+(?:cap|limit|ceiling|maximum)\b", re.I), "no cap"),
156
+ (re.compile(r"\bany\s+and\s+all\s+(?:claims?|damages?|losses?|expenses?|liabilit(?:y|ies))\b", re.I), "any and all claims/damages"),
157
+ (re.compile(r"\b(?:all|every)\s+(?:and\s+any\s+)?(?:claims?|damages?|losses?|liabilit(?:y|ies))\s+(?:whatsoever|of\s+any\s+kind)\b", re.I), "all damages whatsoever"),
158
+ ],
159
+
160
+ # ── Auto-renewal (unchanged) ───────────────────────────────────────
161
  "has_auto_renewal": [
162
+ (re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I), "auto-renew"),
163
+ (re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I), "renews automatically"),
164
+ (re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I), "evergreen clause"),
165
+ (re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I), "continue to renew"),
166
  ],
167
  "has_opt_out_window": [
168
  (re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
169
+ (re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I), "non-renewal notice"),
170
+ (re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I), "opt-out renewal"),
171
+ # Plain "30 days notice" near "renew" is a common opt-out
172
+ (re.compile(r"\brenew\w*[^.]{0,80}?\b\d+\s*(?:days?|weeks?|months?)\s+(?:written\s+)?notice\b", re.I), "N-day notice before renewal"),
173
  ],
174
+
175
+ # ── Arbitration (unchanged) ────────────────────────────────────────
176
  "has_arbitration": [
177
+ (re.compile(r"\barbitrat(?:ion|or|al)\b", re.I), "arbitration"),
178
+ (re.compile(r"\barbitral\s+tribunal\b", re.I), "arbitral tribunal"),
179
  ],
180
  "arbitration_distant_venue": [
181
+ (re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+|in\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
182
+ (re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:in|at)\s+(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
183
  ],
184
+
185
+ # ── EXCLUSIVITY — TIGHTENED (won't match "exclusive of …") ────────
186
  "has_exclusivity": [
187
+ # Contextual nouns — "exclusive rights", "exclusive license", "exclusive distributor", etc.
188
+ (re.compile(r"\bexclusive\s+(?:right|rights|licen[sc]e|licen[sc]ee|distributor|supplier|vendor|agent|territory|market|partner|reseller|dealer)\b", re.I), "exclusive [right/license/distributor/...]"),
189
+ # Canonical exclusivity idioms
190
+ (re.compile(r"\bsole\s+and\s+exclusive\b", re.I), "sole and exclusive"),
191
+ (re.compile(r"\bshall\s+exclusively\b", re.I), "shall exclusively"),
192
+ (re.compile(r"\bgrant(?:s|ed|ing)?\s+(?:an?\s+|the\s+)?exclusive\s+(?:right|rights|licen[sc]e)\b", re.I), "grants exclusive right/license"),
193
+ # "exclusively to/with/for [party]" — directional, not "exclusive of"
194
+ (re.compile(r"\bexclusively\s+(?:to|with|for|by)\s+(?:the|its|a)\s+\w+", re.I), "exclusively to/with"),
195
+ (re.compile(r"\bon\s+an\s+exclusive\s+basis\b", re.I), "on an exclusive basis"),
196
  ],
197
  "exclusivity_term_defined": [
198
+ (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+(?:period|term)\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+", re.I), "exclusivity term"),
199
+ (re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I), "exclusivity duration"),
200
+ (re.compile(r"\bexclusiv\w*[^.]{0,80}?\bduring\s+the\s+(?:term|initial\s+term)\b", re.I), "during the term"),
201
  ],
202
+
203
+ # ── Pricing (unchanged) ────────────────────────────────────────────
204
  "unilateral_price_change": [
205
  (re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
206
  (re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
 
217
  "has_exclusivity": ["exclusivity_term_defined"],
218
  }
219
 
 
220
  def extract(self, text: str) -> tuple[dict, dict]:
221
  features: dict = {}
222
  evidence: dict = {}
 
248
  if child not in features:
249
  features[child] = False
250
 
251
+ # Non-compete years — accept either ordering, but require explicit context
252
  m = re.search(
253
  r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
254
  r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
 
263
  "span": [start, end],
264
  "label": f"{yrs}-year non-compete",
265
  }]
266
+ elif re.search(r"\bnon[\-\s]?compete\b|\bshall\s+not\s+compete\b", text, re.I):
 
267
  features["non_compete_years"] = 1
268
 
269
  return features, evidence
270
 
271
+ # ── Convenience accessors (unchanged) ──────────────────────────────────
272
  def extract_unified(self, text: str) -> dict:
 
273
  features, evidence_dict = self.extract(text)
274
  out = dict(features)
275
  out["evidence"] = self.flatten_evidence(evidence_dict)
inference.py CHANGED
@@ -1,20 +1,21 @@
1
  # inference.py
2
- # Pure utility functions for neuro-symbolic fusion, confidence estimation,
3
- # and score-breakdown reporting. No module-level mutable globals β€” all
4
- # stateful objects live in ModelManager (app.py).
 
 
 
5
 
6
  from __future__ import annotations
7
 
8
- # ── Clause types that should be weighted toward symbolic rules ──────────────
9
  IP_CLAUSE_TYPES = {
10
  "IP Ownership Assignment", "Joint IP Ownership",
11
  "Irrevocable Or Perpetual License",
12
  "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
13
  }
14
 
15
- # ── Mapping: rule_id -> list of features it depends on ──────────────────────
16
- # Used by the explanation engine to surface evidence for each triggered rule
17
- # without inspecting lambda bytecode. Keep in sync with SYMBOLIC_RULES in app.py.
18
  RULE_FEATURE_DEPS = {
19
  "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
20
  "ICA_002": ["unilateral_termination", "notice_period_defined"],
@@ -22,7 +23,8 @@ RULE_FEATURE_DEPS = {
22
  "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
23
  "ICA_005": ["is_wagering_clause"],
24
  "ICA_006": ["restrains_legal_proceedings"],
25
- "ICA_007": ["has_indemnity_clause", "indemnity_capped"],
 
26
  "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
27
  "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
28
  "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
@@ -35,16 +37,22 @@ RULE_FEATURE_DEPS = {
35
  "CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
36
  }
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
40
- """
41
- Evaluate symbolic rules against extracted features.
42
- Returns:
43
- {
44
- "symbolic_score": float, # clamped to [0, 1]
45
- "triggered_rules": list[dict], # rules whose condition fired
46
- }
47
- """
48
  triggered, total = [], 0.0
49
  for rule in symbolic_rules:
50
  try:
@@ -52,7 +60,6 @@ def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
52
  triggered.append(rule)
53
  total += rule["penalty"]
54
  except Exception:
55
- # A malformed rule must not crash inference.
56
  pass
57
  return {
58
  "symbolic_score": round(min(total, 1.0), 3),
@@ -66,46 +73,24 @@ def _neuro_symbolic_fusion(
66
  is_ip_clause: bool = False,
67
  ) -> dict:
68
  """
69
- Weighted fusion of neural and symbolic scores with a transparent breakdown.
70
-
71
- IP clauses shift weight toward symbolic rules (which capture IP-specific law).
72
- A non-zero symbolic score forces a Medium-or-higher floor (since rule triggers
73
- represent deterministic legal violations).
74
-
75
- Returns:
76
- {
77
- "score": float,
78
- "level": "Low" | "Medium" | "High",
79
- "emoji": str,
80
- "breakdown": {
81
- "neural_score": float,
82
- "symbolic_score": float,
83
- "weights": {"neural": float, "symbolic": float},
84
- "raw_fused": float, # pre-floor
85
- "floor_applied": bool,
86
- "final": float,
87
- "formula": str, # human-readable computation
88
- },
89
- }
90
  """
 
 
91
  if is_ip_clause and symbolic > 0:
92
- w_n, w_s = 0.35, 0.65
93
- else:
94
  w_n, w_s = 0.60, 0.40
 
 
95
 
96
  raw = w_n * neural + w_s * symbolic
97
- floor = symbolic > 0 and raw < 0.30
98
- score = max(raw, 0.30) if floor else raw
99
- score = round(min(score, 1.0), 3)
100
 
101
- if score <= 0.33: level, emoji = "Low", "🟒"
102
- elif score <= 0.66: level, emoji = "Medium", "🟑"
103
- else: level, emoji = "High", "πŸ”΄"
104
 
105
  formula = (
106
  f"({w_n:.2f} Γ— {neural:.3f}) + ({w_s:.2f} Γ— {symbolic:.3f}) "
107
  f"= {round(raw, 3)}"
108
- + (f" β†’ floor 0.30 applied (symbolic triggers present)" if floor else "")
109
  )
110
 
111
  return {
@@ -117,7 +102,7 @@ def _neuro_symbolic_fusion(
117
  "symbolic_score": round(symbolic, 3),
118
  "weights": {"neural": w_n, "symbolic": w_s},
119
  "raw_fused": round(raw, 3),
120
- "floor_applied": bool(floor),
121
  "final": score,
122
  "formula": formula,
123
  },
@@ -132,30 +117,19 @@ def _compute_confidence(
132
  neural_loaded: bool = True,
133
  ) -> dict:
134
  """
135
- Estimate prediction confidence on three signals:
136
- 1. boundary_dist β€” distance of the fused score from a risk-level boundary
137
- 2. agreement β€” how closely neural and symbolic agree
138
- 3. rule_strength β€” how many symbolic rules fired
139
-
140
- Returns:
141
- {
142
- "level": "Low" | "Medium" | "High",
143
- "score": float,
144
- "factors": { "boundary_dist": float, "agreement": float,
145
- "rule_strength": float },
146
- }
147
  """
148
- # Distance from the nearest decision boundary (0.33 or 0.66)
149
- boundary_dist = min(abs(fused - 0.33), abs(fused - 0.66))
150
- dist_factor = min(boundary_dist / 0.18, 1.0)
151
 
152
- # Neural vs symbolic agreement (only meaningful if neural is loaded)
153
  if neural_loaded:
154
  agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
155
  else:
156
- agree_factor = 0.5 # neutral when neural is unavailable
157
 
158
- # Rule signal β€” more triggers = stronger deterministic evidence
159
  if num_triggered == 0: rule_factor = 0.40
160
  elif num_triggered == 1: rule_factor = 0.70
161
  else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
 
1
  # inference.py
2
+ # Hardening v5.3:
3
+ # - Neural-dominant fusion (default 0.75 / 0.25, IP 0.60 / 0.40)
4
+ # - Symbolic floor of 0.30 REMOVED
5
+ # - Risk-level thresholds: Low < 0.50, Medium 0.50–0.80, High > 0.80
6
+ # - Confidence recalibrated for the new thresholds
7
+ # - RULE_FEATURE_DEPS updated for tightened ICA_007
8
 
9
  from __future__ import annotations
10
 
 
11
  IP_CLAUSE_TYPES = {
12
  "IP Ownership Assignment", "Joint IP Ownership",
13
  "Irrevocable Or Perpetual License",
14
  "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
15
  }
16
 
17
+ # Rule -> feature dependencies. Used by the explanation engine to surface
18
+ # matched evidence per rule (no lambda introspection required).
 
19
  RULE_FEATURE_DEPS = {
20
  "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
21
  "ICA_002": ["unilateral_termination", "notice_period_defined"],
 
23
  "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
24
  "ICA_005": ["is_wagering_clause"],
25
  "ICA_006": ["restrains_legal_proceedings"],
26
+ # ICA_007 tightened: now requires has_uncapped_signal too
27
+ "ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
28
  "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
29
  "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
30
  "ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
 
37
  "CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
38
  }
39
 
40
+ # ── Risk-level thresholds (single source of truth) ──────────────────────────
41
+ RISK_LOW_MAX = 0.50 # < 0.50 β†’ Low
42
+ RISK_MEDIUM_MAX = 0.80 # 0.50–0.80 β†’ Medium; > 0.80 β†’ High
43
+
44
+
45
+ def level_from_score(score: float) -> tuple[str, str]:
46
+ """Return (level_label, emoji) for a fused score under the v5.3 thresholds."""
47
+ if score < RISK_LOW_MAX:
48
+ return "Low", "🟒"
49
+ if score <= RISK_MEDIUM_MAX:
50
+ return "Medium", "🟑"
51
+ return "High", "πŸ”΄"
52
+
53
 
54
  def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
55
+ """Evaluate symbolic rules. Score is clamped to [0, 1]."""
 
 
 
 
 
 
 
56
  triggered, total = [], 0.0
57
  for rule in symbolic_rules:
58
  try:
 
60
  triggered.append(rule)
61
  total += rule["penalty"]
62
  except Exception:
 
63
  pass
64
  return {
65
  "symbolic_score": round(min(total, 1.0), 3),
 
73
  is_ip_clause: bool = False,
74
  ) -> dict:
75
  """
76
+ Weighted fusion — neural-dominant by design.
77
+ No artificial floor: a weak symbolic trigger no longer inflates risk.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  """
79
+ # Default neural-dominant. IP clauses give a bit more weight to symbolic,
80
+ # but symbolic NEVER outweighs neural.
81
  if is_ip_clause and symbolic > 0:
 
 
82
  w_n, w_s = 0.60, 0.40
83
+ else:
84
+ w_n, w_s = 0.75, 0.25
85
 
86
  raw = w_n * neural + w_s * symbolic
87
+ score = round(min(max(raw, 0.0), 1.0), 3)
 
 
88
 
89
+ level, emoji = level_from_score(score)
 
 
90
 
91
  formula = (
92
  f"({w_n:.2f} Γ— {neural:.3f}) + ({w_s:.2f} Γ— {symbolic:.3f}) "
93
  f"= {round(raw, 3)}"
 
94
  )
95
 
96
  return {
 
102
  "symbolic_score": round(symbolic, 3),
103
  "weights": {"neural": w_n, "symbolic": w_s},
104
  "raw_fused": round(raw, 3),
105
+ "floor_applied": False, # retained for UI compat; always False now
106
  "final": score,
107
  "formula": formula,
108
  },
 
117
  neural_loaded: bool = True,
118
  ) -> dict:
119
  """
120
+ Three-factor confidence calibrated for the new thresholds.
121
+ boundary_dist – distance from the nearest risk-level cutoff (0.50, 0.80)
122
+ agreement – 1 - |neural - symbolic| (only when neural is loaded)
123
+ rule_strength – more triggered rules ⇒ stronger deterministic evidence
 
 
 
 
 
 
 
 
124
  """
125
+ boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
126
+ dist_factor = min(boundary_dist / 0.20, 1.0)
 
127
 
 
128
  if neural_loaded:
129
  agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
130
  else:
131
+ agree_factor = 0.5
132
 
 
133
  if num_triggered == 0: rule_factor = 0.40
134
  elif num_triggered == 1: rule_factor = 0.70
135
  else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
local_interpreters.py CHANGED
@@ -1,10 +1,10 @@
1
  # local_interpreters.py
2
- # Lightweight local explainability:
3
- # β€’ LIME text explanations (capped at 50 perturbations for CPU sanity)
4
- # β€’ Longformer attention extraction (one extra forward pass, runs only
5
- # when a clause is opened β€” never during batch analysis)
6
 
7
  from __future__ import annotations
 
8
  import time
9
  import numpy as np
10
  import torch
@@ -16,15 +16,65 @@ except Exception:
16
  _LIME_AVAILABLE = False
17
 
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  class LocalExplainer:
20
- """
21
- LIME + attention extraction.
22
-
23
- Methods:
24
- β€’ explain_with_lime(text, predict_fn) -> list[{word, weight}]
25
- β€’ get_attention_map(text, model, tokenizer) -> list[{token, weight}]
26
- """
27
-
28
  def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
29
  self.num_samples = num_samples
30
  self.timeout_seconds = timeout_seconds
@@ -44,38 +94,43 @@ class LocalExplainer:
44
  self,
45
  text: str,
46
  predict_fn,
47
- num_features: int = 10,
 
48
  ) -> list[dict]:
49
  """
50
- predict_fn(list_of_texts) -> np.ndarray shape (n_texts, 2)
51
- column 0 = "safe" prob (1 - risk_score)
52
- column 1 = "risky" prob (risk_score)
53
- Returns a sorted list of {word, weight} (positive = pushes toward risky).
54
  """
55
  if not _LIME_AVAILABLE or self._lime is None:
56
  return []
57
  try:
58
  t0 = time.time()
59
  exp = self._lime.explain_instance(
60
- text_instance=text[:1500], # keep LIME bounded
61
  classifier_fn=predict_fn,
62
  num_features=num_features,
63
  num_samples=self.num_samples,
64
  labels=(1,),
65
  )
66
  elapsed = time.time() - t0
67
- print(f"[INFO] LIME completed in {elapsed:.1f}s "
68
- f"(samples={self.num_samples}, features={num_features})")
69
- return [
70
- {"word": w, "weight": round(float(s), 4)}
71
- for w, s in exp.as_list(label=1)
72
  ]
 
 
 
 
 
 
73
  except Exception as e:
74
  self.last_error = str(e)
75
  print(f"[WARN] LIME failed: {e}")
76
  return []
77
 
78
- # ── Attention ──────────────────────────────────────────────────────────
79
  def get_attention_map(
80
  self,
81
  text: str,
@@ -84,12 +139,6 @@ class LocalExplainer:
84
  max_length: int = 256,
85
  top_k: int = 30,
86
  ) -> list[dict]:
87
- """
88
- Extract Longformer last-layer global attention from CLS over the
89
- sequence, average over heads, return top_k tokens by weight.
90
- Runs a second forward pass with output_attentions=True (memory cost
91
- is bounded because we only do this for a single clause on demand).
92
- """
93
  if model is None or tokenizer is None:
94
  return []
95
  try:
@@ -100,8 +149,6 @@ class LocalExplainer:
100
  input_ids = enc["input_ids"]
101
  attention_mask = enc["attention_mask"]
102
 
103
- # Force the CLS token (position 0) to use global attention so we
104
- # get a proper distribution over the whole sequence.
105
  global_mask = torch.zeros_like(input_ids)
106
  global_mask[:, 0] = 1
107
 
@@ -113,20 +160,16 @@ class LocalExplainer:
113
  output_attentions = True,
114
  )
115
 
116
- # Longformer exposes global_attentions when global tokens exist.
117
- # Shape: tuple of (batch, num_heads, num_global_tokens, seq_len)
118
  if not getattr(out, "global_attentions", None):
119
  return []
120
 
121
- last_global = out.global_attentions[-1] # last layer
122
- cls_attn = last_global[0, :, 0, :].mean(dim=0) # avg heads
123
 
124
- # Trim to real (non-pad) tokens
125
- n_real = int(attention_mask[0].sum().item())
126
- tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
127
  weights = cls_attn[:n_real].cpu().numpy()
128
 
129
- # Skip special tokens for the ranking, but keep them in the sequence
130
  specials = {tokenizer.cls_token, tokenizer.sep_token,
131
  tokenizer.pad_token, tokenizer.bos_token,
132
  tokenizer.eos_token, tokenizer.unk_token}
@@ -145,15 +188,11 @@ class LocalExplainer:
145
  return []
146
 
147
 
148
- # ── Helper: build a LIME-compatible predict_fn from ModelManager ────────────
 
 
149
  def build_predict_fn_for_manager(manager, max_length: int = 256):
150
- """
151
- Returns a callable LIME can use: list[str] -> np.ndarray (n, 2).
152
- Uses the manager's model + tokenizer. Falls back to symbolic if the
153
- neural model isn't loaded (each text just gets [1-sym, sym]).
154
- """
155
  def predict_fn(texts: list[str]) -> np.ndarray:
156
- # Symbolic-only fallback path
157
  if not (manager.is_ready and manager.model is not None):
158
  from inference import _symbolic_rule_score
159
  from app import SYMBOLIC_RULES
@@ -164,7 +203,6 @@ def build_predict_fn_for_manager(manager, max_length: int = 256):
164
  probs.append([1.0 - sym, sym])
165
  return np.array(probs, dtype=np.float32)
166
 
167
- # Neural path β€” batch through Longformer
168
  enc = manager.tokenizer(
169
  list(texts),
170
  padding="max_length", truncation=True,
 
1
  # local_interpreters.py
2
+ # v5.3 — LIME output filtered for legal interpretability.
3
+ # The neural model still sees ORIGINAL text. Filtering happens at the
4
+ # display layer only.
 
5
 
6
  from __future__ import annotations
7
+ import re
8
  import time
9
  import numpy as np
10
  import torch
 
16
  _LIME_AVAILABLE = False
17
 
18
 
19
+ # ─────────────────────────────────────────────────────────────────────────────
20
+ # Token filtering — display-time only
21
+ # ─────────────────────────────────────────────────────────────────────────────
22
+ LIME_STOPWORDS = {
23
+ "a", "an", "the",
24
+ "of", "in", "on", "at", "to", "for", "by", "with", "from",
25
+ "and", "or", "but",
26
+ "this", "that", "these", "those",
27
+ "it", "its",
28
+ "be", "is", "are", "was", "were", "been", "being",
29
+ "have", "has", "had", "do", "does", "did",
30
+ "as", "if", "so", "than", "then",
31
+ "any", "all", "such", "no", # ambiguous but mostly noise here
32
+ "i", "we", "you", "they", "he", "she",
33
+ }
34
+
35
+ # Legal modal / operative words — never filter these even if they look small
36
+ LIME_KEEP_LEGAL = {
37
+ "shall", "may", "must", "not", "only", "unless", "except", "without",
38
+ "subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
39
+ "terminate", "termination", "exclusive", "exclusively", "exclusivity",
40
+ "warrant", "warranty", "breach", "obligation", "covenant", "license",
41
+ "licence", "damages", "consent", "notice", "renew", "renewal",
42
+ "arbitration", "arbitrator", "jurisdiction", "wager", "gambling",
43
+ "assign", "assignment", "limit", "cap", "uncapped", "unlimited",
44
+ "confidential", "disclose", "non-compete", "non-solicit",
45
+ }
46
+
47
+ # Roman numeral regex (used for filtering things like "IV", "xii")
48
+ _ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
49
+
50
+
51
+ def _clean_token(raw: str) -> str:
52
+ """Strip leading/trailing punctuation; return lowercased core."""
53
+ return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
54
+
55
+
56
+ def _is_useful_lime_token(raw: str) -> bool:
57
+ core = _clean_token(raw)
58
+ if not core:
59
+ return False
60
+ if core in LIME_KEEP_LEGAL:
61
+ return True
62
+ if core in LIME_STOPWORDS:
63
+ return False
64
+ if core.isdigit():
65
+ return False
66
+ if _ROMAN_NUMERAL.fullmatch(core):
67
+ return False
68
+ # Need at least 2 alphanumeric chars to be a meaningful word
69
+ if sum(c.isalnum() for c in core) < 2:
70
+ return False
71
+ return True
72
+
73
+
74
+ # ─────────────────────────────────────────────────────────────────────────────
75
+ # LocalExplainer
76
+ # ─────────────────────────────────────────────────────────────────────────────
77
  class LocalExplainer:
 
 
 
 
 
 
 
 
78
  def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
79
  self.num_samples = num_samples
80
  self.timeout_seconds = timeout_seconds
 
94
  self,
95
  text: str,
96
  predict_fn,
97
+ num_features: int = 25, # raw β€” over-request, filter later
98
+ display_count: int = 12,
99
  ) -> list[dict]:
100
  """
101
+ Returns up to `display_count` filtered token contributions, sorted by
102
+ absolute weight. The model itself still sees the original full text.
 
 
103
  """
104
  if not _LIME_AVAILABLE or self._lime is None:
105
  return []
106
  try:
107
  t0 = time.time()
108
  exp = self._lime.explain_instance(
109
+ text_instance=text[:1500],
110
  classifier_fn=predict_fn,
111
  num_features=num_features,
112
  num_samples=self.num_samples,
113
  labels=(1,),
114
  )
115
  elapsed = time.time() - t0
116
+ raw_pairs = exp.as_list(label=1)
117
+
118
+ # Filter for legal interpretability
119
+ filtered: list[tuple[str, float]] = [
120
+ (w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
121
  ]
122
+ filtered.sort(key=lambda x: abs(x[1]), reverse=True)
123
+ top = filtered[:display_count]
124
+
125
+ print(f"[INFO] LIME {elapsed:.1f}s, raw={len(raw_pairs)}, "
126
+ f"filtered={len(filtered)}, displayed={len(top)}")
127
+ return [{"word": w, "weight": round(s, 4)} for w, s in top]
128
  except Exception as e:
129
  self.last_error = str(e)
130
  print(f"[WARN] LIME failed: {e}")
131
  return []
132
 
133
+ # ── Attention (unchanged) ──────────────────────────────────────────────
134
  def get_attention_map(
135
  self,
136
  text: str,
 
139
  max_length: int = 256,
140
  top_k: int = 30,
141
  ) -> list[dict]:
 
 
 
 
 
 
142
  if model is None or tokenizer is None:
143
  return []
144
  try:
 
149
  input_ids = enc["input_ids"]
150
  attention_mask = enc["attention_mask"]
151
 
 
 
152
  global_mask = torch.zeros_like(input_ids)
153
  global_mask[:, 0] = 1
154
 
 
160
  output_attentions = True,
161
  )
162
 
 
 
163
  if not getattr(out, "global_attentions", None):
164
  return []
165
 
166
+ last_global = out.global_attentions[-1]
167
+ cls_attn = last_global[0, :, 0, :].mean(dim=0)
168
 
169
+ n_real = int(attention_mask[0].sum().item())
170
+ tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
 
171
  weights = cls_attn[:n_real].cpu().numpy()
172
 
 
173
  specials = {tokenizer.cls_token, tokenizer.sep_token,
174
  tokenizer.pad_token, tokenizer.bos_token,
175
  tokenizer.eos_token, tokenizer.unk_token}
 
188
  return []
189
 
190
 
191
+ # ─────────────────────────────────────────────────────────────────────────────
192
+ # Predict-fn factory (unchanged)
193
+ # ─────────────────────────────────────────────────────────────────────────────
194
  def build_predict_fn_for_manager(manager, max_length: int = 256):
 
 
 
 
 
195
  def predict_fn(texts: list[str]) -> np.ndarray:
 
196
  if not (manager.is_ready and manager.model is not None):
197
  from inference import _symbolic_rule_score
198
  from app import SYMBOLIC_RULES
 
203
  probs.append([1.0 - sym, sym])
204
  return np.array(probs, dtype=np.float32)
205
 
 
206
  enc = manager.tokenizer(
207
  list(texts),
208
  padding="max_length", truncation=True,
pdf_utils.py CHANGED
@@ -1,157 +1,188 @@
1
  # pdf_utils.py
2
- # PDF text extraction + advanced clause segmentation for ClauseXplain
 
 
 
3
 
4
  from __future__ import annotations
5
  import re
6
 
7
 
8
- def extract_text_from_pdf(file_path: str) -> str:
9
- """
10
- Extract and clean text from all pages of a PDF using PyMuPDF.
11
- """
12
- import fitz # pymupdf
13
-
14
- doc = fitz.open(file_path)
15
- pages = []
16
 
17
- for page in doc:
18
- text = page.get_text("text")
19
- pages.append(text)
20
 
 
 
 
 
 
 
 
21
  doc.close()
22
-
23
  raw = "\n".join(pages)
24
-
25
- # Normalize whitespace while preserving structure
26
- raw = re.sub(r"\r\n", "\n", raw)
27
- raw = re.sub(r"[ \t]+", " ", raw)
28
- raw = re.sub(r"\n{3,}", "\n\n", raw)
29
-
30
  return raw.strip()
31
 
32
 
33
- def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  """
35
- Backward-compatible clause splitter.
36
-
37
- Returns:
38
- list[str]
39
  """
40
- clauses = split_into_clauses_with_metadata(text, min_length)
41
- return [c["text"] for c in clauses]
 
42
 
 
43
 
44
- def split_into_clauses_with_metadata(text: str, min_length: int = 40) -> list[dict]:
45
- """
46
- Advanced legal clause segmentation with metadata.
 
 
 
 
 
47
 
48
- Detects:
49
- 1. Numbered clauses β†’ 1., 1.1, 2.3.4
50
- 2. Articles / Sections β†’ Article 5, Section 7
51
- 3. Lettered clauses β†’ (a), (b), (c)
52
- 4. ALL CAPS headings β†’ GOVERNING LAW
53
- 5. Paragraph fallback
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- Returns:
56
- [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  {
58
- "text": "...",
59
- "number": "1.2",
60
- "kind": "numbered"
61
  }
 
62
  ]
63
- """
64
-
65
- if not text or not text.strip():
66
- return []
67
-
68
- text = text.strip()
69
 
70
- numbered_pattern = re.compile(
71
- r'(?m)^(?=\s*(?:\d+(?:\.\d+)*\.?|Article\s+\d+|Section\s+\d+))',
72
- re.IGNORECASE
73
- )
74
 
75
- lettered_pattern = re.compile(
76
- r'(?m)^(?=\s*\([a-z]\))'
77
- )
78
-
79
- caps_pattern = re.compile(
80
- r'(?m)^(?=\s*[A-Z][A-Z\s]{4,}$)'
81
- )
82
-
83
- def extract_clause_number(clause: str, kind: str):
84
- if kind == "numbered":
85
- match = re.match(
86
- r'^\s*(\d+(?:\.\d+)*\.?|Article\s+\d+|Section\s+\d+)',
87
- clause,
88
- re.IGNORECASE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  )
90
- return match.group(1) if match else None
91
-
92
- elif kind == "lettered":
93
- match = re.match(r'^\s*(\([a-z]\))', clause)
94
- return match.group(1) if match else None
95
-
96
- return None
97
-
98
- def build_metadata(parts: list[str], kind: str) -> list[dict]:
99
- results = []
100
 
101
- for part in parts:
102
- clause = part.strip()
 
 
103
 
104
- if len(clause) < min_length:
105
- continue
106
 
107
- results.append({
108
- "text": clause,
109
- "number": extract_clause_number(clause, kind),
110
- "kind": kind
111
- })
112
-
113
- return results
114
-
115
- # Strategy 1: numbered clauses
116
- numbered_parts = [p.strip() for p in numbered_pattern.split(text) if p.strip()]
117
- if len(numbered_parts) > 1:
118
- result = build_metadata(numbered_parts, "numbered")
119
- if len(result) > 1:
120
- return result
121
-
122
- # Strategy 2: lettered clauses
123
- lettered_parts = [p.strip() for p in lettered_pattern.split(text) if p.strip()]
124
- if len(lettered_parts) > 1:
125
- result = build_metadata(lettered_parts, "lettered")
126
- if len(result) > 1:
127
- return result
128
-
129
- # Strategy 3: ALL CAPS headings
130
- caps_parts = [p.strip() for p in caps_pattern.split(text) if p.strip()]
131
- if len(caps_parts) > 1:
132
- result = build_metadata(caps_parts, "caps_heading")
133
- if len(result) > 1:
134
- return result
135
-
136
- # Strategy 4: paragraph fallback
137
- paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
138
-
139
- fallback = []
140
-
141
- for para in paragraphs:
142
- if len(para) >= min_length:
143
- fallback.append({
144
- "text": para,
145
- "number": None,
146
- "kind": "paragraph"
147
- })
148
-
149
- # Final fallback: whole document as one clause
150
- if not fallback and len(text) >= min_length:
151
- fallback.append({
152
- "text": text,
153
- "number": None,
154
- "kind": "full_text"
155
- })
156
 
157
- return fallback
 
 
 
1
  # pdf_utils.py
2
+ # v5.3 — finer-grained clause segmentation.
3
+ # Adds:
4
+ # • Inline subclause splitting for long clauses (a), (b), (c), (i), (ii) …
5
+ # • Hard length cap with sentence-boundary fallback
6
 
7
  from __future__ import annotations
8
  import re
9
 
10
 
11
+ LONG_CLAUSE_CHARS = 1200
12
+ MAX_CLAUSE_CHARS = 3000
13
+ MIN_SUBCLAUSE_LEN = 60
 
 
 
 
 
14
 
 
 
 
15
 
16
+ # ─────────────────────────────────────────────────────────────────────────────
17
+ # PDF extraction (unchanged)
18
+ # ─────────────────────────────────────────────────────────────────────────────
19
+ def extract_text_from_pdf(file_path: str) -> str:
20
+ import fitz
21
+ doc = fitz.open(file_path)
22
+ pages = [page.get_text("text") for page in doc]
23
  doc.close()
 
24
  raw = "\n".join(pages)
25
+ raw = re.sub(r'\r\n', '\n', raw)
26
+ raw = re.sub(r'[ \t]+', ' ', raw)
27
+ raw = re.sub(r'\n{3,}', '\n\n', raw)
28
+ raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)
 
 
29
  return raw.strip()
30
 
31
 
32
+ # ─────────────────────────────────────────────────────────────────────────────
33
+ # Header detection (primary segmentation)
34
+ # ─────────────────────────────────────────────────────────────────────────────
35
# (kind, pattern) pairs scanned by _collect_headers(). Every pattern is
# multi-line and captures the header marker itself as group 1.
_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
    # Line-leading decimal numbering such as "1", "1.", "2.3" or "4.1.2.3".
    ("decimal", re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
    # A keyword ("Article", "Section", "Schedule", ...) followed by a decimal
    # or roman-numeral identifier; matched case-insensitively.
    ("article", re.compile(
        r'(?m)^\s*((?:Article|Section|Clause|Schedule|Annexure|Annex|Appendix|Part|Chapter)'
        r'\s+(?:\d+(?:\.\d+){0,2}|[IVXLC]+))[\s\.\-:]', re.IGNORECASE)),
    # Line-leading parenthesised marker of one to four letters, e.g. "(a)", "(iv)".
    ("lettered", re.compile(r'(?m)^\s*(\(\s*[a-zA-Z]{1,4}\s*\))\s+(?=\S)')),
    # Line-leading upper-case roman numeral followed by a dot, e.g. "IV.".
    ("roman", re.compile(r'(?m)^\s*([IVX]{1,5}\.)\s+(?=\S)')),
    # A whole line of 5-60 upper-case letters, digits, spaces, "&", "/" or "-".
    ("caps", re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
]

# Inline subclause markers — used in the SECOND pass (mid-text, not line-start).
# The marker must be preceded by whitespace, ".", ";" or ":" and followed by
# whitespace and a word character; group 1 is the marker, e.g. "(a)" or "(ii)".
_INLINE_SUBCLAUSE = re.compile(
    r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
    re.IGNORECASE,
)
50
+
51
+
52
def _collect_headers(text: str) -> list[tuple[int, str, str]]:
    """Find every header marker in *text*, ordered by position.

    Each pattern in ``_HEADER_PATTERNS`` is scanned over the whole text and
    every hit is recorded as ``(offset of the marker, stripped marker text,
    pattern kind)``. A hit whose offset lies within two characters of the
    previously kept hit is treated as a duplicate and discarded.
    """
    candidates = [
        (match.start(1), match.group(1).strip(), label)
        for label, pattern in _HEADER_PATTERNS
        for match in pattern.finditer(text)
    ]
    # sorted() is stable, so hits at the same offset keep pattern order.
    ordered = sorted(candidates, key=lambda hit: hit[0])

    unique: list[tuple[int, str, str]] = []
    last_kept_offset = None
    for hit in ordered:
        if last_kept_offset is not None and abs(hit[0] - last_kept_offset) <= 2:
            continue
        unique.append(hit)
        last_kept_offset = hit[0]
    return unique
64
+
65
+
66
+ # ─────────────────────────────────────────────────────────────────────────────
67
+ # Inline subclause post-processing
68
+ # ─────────────────────────────────────────────────────────────────────────────
69
def _split_inline_subclauses(
    body: str,
    parent_number: str | None = None,
    min_length: int = MIN_SUBCLAUSE_LEN,
) -> list[dict]:
    """Split a clause body on inline subclause markers such as (a), (b), (i).

    Subclause numbers are prefixed with the parent number when one is given
    (e.g. "5.7" + "(a)" -> "5.7(a)"). No text is discarded: a chunk shorter
    than ``min_length`` is appended to the previous part, and any text that
    precedes the first emitted part (a short lead-in or short leading
    subclauses) is prepended to that first part.

    Args:
        body: Full text of the clause to split.
        parent_number: Number/marker of the enclosing clause, if any.
        min_length: Minimum length of a chunk to become its own part.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts, or ``[]`` when fewer
        than two markers are found or fewer than two parts would result (the
        caller then keeps the original clause).
    """
    matches = list(_INLINE_SUBCLAUSE.finditer(body))
    if len(matches) < 2:
        return []

    parts: list[dict] = []
    # Text seen before any part exists; attached to the first part emitted so
    # that a short lead-in or heading is not lost.
    carry = ""

    # Text before the first marker (usually the parent header line).
    head = body[:matches[0].start()].strip()
    if len(head) >= 30:
        parts.append({
            "text": head,
            "number": parent_number,
            "kind": "decimal" if parent_number else "paragraph",
        })
    else:
        carry = head

    # Each chunk runs from its marker to the next marker (or end of body).
    ends = [m.start() for m in matches[1:]] + [len(body)]
    for match, end in zip(matches, ends):
        chunk = body[match.start():end].strip()

        if len(chunk) < min_length:
            # Too short to be a real subclause: fold into the previous part,
            # or hold it until a part exists.
            if parts:
                parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
            else:
                carry = f"{carry}\n{chunk}".strip()
            continue

        if carry:
            chunk = f"{carry}\n{chunk}"
            carry = ""

        sub_marker = match.group(1).strip()
        parts.append({
            "text": chunk,
            "number": f"{parent_number}{sub_marker}" if parent_number else sub_marker,
            "kind": "subclause",
        })

    # A single part is not a split; let the caller keep the original clause.
    if len(parts) < 2:
        return []
    return parts
113
+
114
+
115
def _slice_oversized(sentence: str, max_len: int) -> list[str]:
    """Break one over-long sentence into pieces of at most ``max_len`` chars."""
    if max_len < 1 or len(sentence) <= max_len:
        return [sentence]

    pieces: list[str] = []
    rest = sentence
    while len(rest) > max_len:
        # Prefer the last space/newline inside the window; otherwise cut hard.
        cut = max(
            rest.rfind(" ", 1, max_len + 1),
            rest.rfind("\n", 1, max_len + 1),
        )
        if cut < 1:
            cut = max_len
        piece = rest[:cut].rstrip()
        if piece:
            pieces.append(piece)
        rest = rest[cut:].lstrip()
    if rest:
        pieces.append(rest)
    return pieces


def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
    """Chunk a clause that is still longer than ``max_len`` characters.

    The text is split on sentence boundaries (".", "?" or "!" followed by
    whitespace and a capital letter) and sentences are packed greedily into
    chunks. A single sentence longer than ``max_len`` is further broken at
    whitespace (or cut hard when it has none), so every chunk respects the
    cap even for text without sentence boundaries.

    Args:
        clause: Dict with at least a ``"text"`` key; ``"number"`` and
            ``"kind"`` are copied onto the chunks when present.
        max_len: Maximum number of characters per chunk.

    Returns:
        ``[clause]`` unchanged when the text already fits. Otherwise one dict
        per chunk, sharing the clause's number and with "/chunked" appended
        to its kind. Chunks shorter than ``MIN_SUBCLAUSE_LEN`` are omitted.
    """
    body = clause["text"]
    if len(body) <= max_len:
        return [clause]

    chunks: list[str] = []
    current = ""
    for sentence in re.split(r'(?<=[\.\?\!])\s+(?=[A-Z])', body):
        for piece in _slice_oversized(sentence, max_len):
            if current and len(current) + len(piece) + 1 > max_len:
                chunks.append(current.strip())
                current = piece
            else:
                current = f"{current} {piece}".strip() if current else piece
    if current:
        chunks.append(current.strip())

    # Tolerate a missing or None kind instead of failing on concatenation.
    chunk_kind = (clause.get("kind") or "paragraph") + "/chunked"
    return [
        {
            "text": chunk,
            "number": clause.get("number"),
            "kind": chunk_kind,
        }
        for chunk in chunks
        if len(chunk) >= MIN_SUBCLAUSE_LEN
    ]
 
 
 
 
 
 
139
 
 
 
 
 
140
 
141
+ # ─────────────────────────────────────────────────────────────────────────────
142
+ # Public API
143
+ # ─────────────────────────────────────────────────────────────────────────────
144
def split_into_clauses_with_metadata(
    text: str,
    min_length: int = 40,
) -> list[dict]:
    """Segment contract text into clauses, each with a number and a kind.

    Three passes are applied:
      1. Primary: split at detected headers. Text before the first header is
         kept as a "paragraph" clause. When header splitting yields nothing,
         fall back to blank-line paragraphs, and finally to the whole
         document as a single "full_text" clause.
      2. Secondary: clauses longer than ``LONG_CLAUSE_CHARS`` are split on
         inline subclause markers where possible.
      3. Tertiary: every clause is passed through the hard length cap.

    Args:
        text: Full document text.
        min_length: Minimum length (characters) for a primary segment to be
            kept as a clause.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts in document order;
        empty when ``text`` is empty/blank or nothing reaches ``min_length``.
    """
    if not text or not text.strip():
        return []

    headers = _collect_headers(text)

    # ── Primary segmentation (heading-based) ───────────────────────────────
    sections: list[dict] = []
    ends = [h[0] for h in headers[1:]] + [len(text)]
    for (start, marker, kind), end in zip(headers, ends):
        body = text[start:end].strip()
        if len(body) >= min_length:
            sections.append({"text": body, "number": marker, "kind": kind})

    primary: list[dict] = []
    if sections:
        # Keep what precedes the first header instead of discarding it.
        preamble = text[:headers[0][0]].strip()
        if len(preamble) >= min_length:
            primary.append({"text": preamble, "number": None, "kind": "paragraph"})
        primary.extend(sections)
    else:
        # Paragraph fallback when header-based splitting produced nothing.
        primary = [
            {"text": para, "number": None, "kind": "paragraph"}
            for para in (p.strip() for p in re.split(r'\n\s*\n', text))
            if len(para) >= min_length
        ]
        # Final fallback: the whole document as one clause.
        whole = text.strip()
        if not primary and len(whole) >= min_length:
            primary.append({"text": whole, "number": None, "kind": "full_text"})

    # ── Secondary pass: inline subclause splitting for long clauses ────────
    refined: list[dict] = []
    for clause in primary:
        subs = []
        if len(clause["text"]) > LONG_CLAUSE_CHARS:
            subs = _split_inline_subclauses(
                clause["text"],
                parent_number=clause.get("number"),
            )
        if subs:
            refined.extend(subs)
        else:
            refined.append(clause)

    # ── Tertiary pass: hard length cap (sentence-boundary chunking) ────────
    final: list[dict] = []
    for clause in refined:
        final.extend(_hard_cap_split(clause))
    return final
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
    """Return only the clause texts (backward-compatible plain-string API)."""
    records = split_into_clauses_with_metadata(text, min_length)
    texts = []
    for record in records:
        texts.append(record["text"])
    return texts