Spaces:
Sleeping
Sleeping
Explainability and Symbolic part fixed 12th May
Browse files- app.py +40 -68
- explanation.py +13 -53
- feature_extractor.py +125 -107
- inference.py +40 -66
- local_interpreters.py +86 -48
- pdf_utils.py +159 -128
app.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
# app.py
|
| 2 |
-
# ClauseXplain v5.
|
| 3 |
-
#
|
| 4 |
-
#
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
import os
|
|
@@ -18,10 +21,10 @@ from transformers import LongformerTokenizer, LongformerModel
|
|
| 18 |
from sklearn.preprocessing import MultiLabelBinarizer
|
| 19 |
from huggingface_hub import hf_hub_download
|
| 20 |
|
| 21 |
-
# ββ New modules βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
from feature_extractor import ClauseFeatureExtractor
|
| 23 |
from explanation import generate_explanation
|
| 24 |
-
from
|
|
|
|
| 25 |
|
| 26 |
# ββ Optional / fail-soft integrations βββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
try:
|
|
@@ -52,10 +55,8 @@ except Exception as _e:
|
|
| 52 |
print(f"[WARN] report disabled: {_e}")
|
| 53 |
generate_report = None
|
| 54 |
|
| 55 |
-
# ββ Device β always CPU on HF free tier βββββββββββββββββββββββββββββββββββββββ
|
| 56 |
DEVICE = torch.device("cpu")
|
| 57 |
|
| 58 |
-
# ββ Label sets (unchanged) ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 59 |
CLAUSE_CLASSES = [
|
| 60 |
"Cap On Liability", "Change Of Control", "Covenant Not To Sue",
|
| 61 |
"Exclusivity", "Governing Law", "IP Ownership Assignment",
|
|
@@ -76,7 +77,7 @@ CLAUSE_CLASSES = [
|
|
| 76 |
RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
|
| 77 |
|
| 78 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
-
# Symbolic rules β
|
| 80 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
SYMBOLIC_RULES = [
|
| 82 |
{"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
|
|
@@ -91,21 +92,28 @@ SYMBOLIC_RULES = [
|
|
| 91 |
{"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
|
| 92 |
"reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
|
| 93 |
"condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
|
| 94 |
-
|
|
|
|
| 95 |
"reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
|
| 96 |
"condition": lambda f: f.get("is_wagering_clause")},
|
| 97 |
{"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
|
| 98 |
"reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
|
| 99 |
"condition": lambda f: f.get("restrains_legal_proceedings")},
|
|
|
|
| 100 |
{"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
|
| 101 |
"reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
|
| 102 |
-
"condition": lambda f:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
{"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
|
| 104 |
"reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
|
| 105 |
"condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
|
| 106 |
{"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
|
| 107 |
"reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
|
| 108 |
"condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
|
|
|
|
| 109 |
{"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
|
| 110 |
"reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
|
| 111 |
"condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
|
|
@@ -134,7 +142,7 @@ SYMBOLIC_RULES = [
|
|
| 134 |
|
| 135 |
|
| 136 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 137 |
-
# Model
|
| 138 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 139 |
class ClauseXplainV5(nn.Module):
|
| 140 |
def __init__(self, num_clause_labels: int, num_risk_labels: int):
|
|
@@ -211,7 +219,6 @@ class ModelManager:
|
|
| 211 |
repo_id="riyasuryawanshi746/clauseXplain",
|
| 212 |
filename="clausexplain_v5_best.pt",
|
| 213 |
)
|
| 214 |
-
print(f"[INFO] Checkpoint at: {ckpt_path}")
|
| 215 |
checkpoint = torch.load(
|
| 216 |
ckpt_path,
|
| 217 |
map_location=torch.device("cpu"),
|
|
@@ -301,7 +308,7 @@ class ModelManager:
|
|
| 301 |
"top_risk_cats": top_risks,
|
| 302 |
"triggered_rules": triggered_clean,
|
| 303 |
"features": {k: v for k, v in features.items() if v},
|
| 304 |
-
"evidence": evidence,
|
| 305 |
"score_breakdown": fusion["breakdown"],
|
| 306 |
"confidence": confidence,
|
| 307 |
}
|
|
@@ -333,9 +340,8 @@ class ModelManager:
|
|
| 333 |
|
| 334 |
scores = [r["risk_score"] for r in results]
|
| 335 |
overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
else: level = "High"
|
| 339 |
return {
|
| 340 |
"overall_risk": overall,
|
| 341 |
"overall_level": level,
|
|
@@ -348,7 +354,7 @@ class ModelManager:
|
|
| 348 |
manager = ModelManager()
|
| 349 |
|
| 350 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 351 |
-
# UI helpers
|
| 352 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 353 |
LEVEL_COLOR = {"Low": "π’", "Medium": "π‘", "High": "π΄"}
|
| 354 |
LEVEL_HEX = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
|
|
@@ -412,8 +418,6 @@ def _score_breakdown_html(breakdown) -> str:
|
|
| 412 |
if not breakdown:
|
| 413 |
return ""
|
| 414 |
w = breakdown["weights"]
|
| 415 |
-
floor_note = ('<div class="cx-bd-floor">β Floor 0.30 applied β symbolic rules fired</div>'
|
| 416 |
-
if breakdown.get("floor_applied") else "")
|
| 417 |
return f"""
|
| 418 |
<div class="cx-breakdown">
|
| 419 |
<div class="cx-bd-row">
|
|
@@ -432,12 +436,11 @@ def _score_breakdown_html(breakdown) -> str:
|
|
| 432 |
<span class="cx-bd-k">Final</span>
|
| 433 |
<span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
|
| 434 |
</div>
|
| 435 |
-
{floor_note}
|
| 436 |
</div>"""
|
| 437 |
|
| 438 |
|
| 439 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 440 |
-
# Analysis flow
|
| 441 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 442 |
def _run_analysis(text: str):
|
| 443 |
if not text or len(text.strip()) < 30:
|
|
@@ -505,7 +508,6 @@ def _build_outputs(text: str):
|
|
| 505 |
</div>
|
| 506 |
{model_note}"""
|
| 507 |
|
| 508 |
-
# Top cards
|
| 509 |
top_parts = ['<div class="cx-section-title">π₯ Top Risk Clauses</div>',
|
| 510 |
'<div class="cx-top-grid">']
|
| 511 |
for r in doc["top_risks"]:
|
|
@@ -537,7 +539,6 @@ def _build_outputs(text: str):
|
|
| 537 |
top_parts.append("</div>")
|
| 538 |
top_html = "\n".join(top_parts)
|
| 539 |
|
| 540 |
-
# Markdown breakdown table
|
| 541 |
rows = [
|
| 542 |
"## π All Clauses\n",
|
| 543 |
"| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
|
|
@@ -565,14 +566,10 @@ def _build_outputs(text: str):
|
|
| 565 |
f"{r['clause_text'][:55].replace(chr(10), ' ')}β¦"
|
| 566 |
for r in doc["clauses"]
|
| 567 |
]
|
| 568 |
-
# PDF download button: visible only after a successful analysis
|
| 569 |
pdf_update = gr.update(visible=True, value=None)
|
| 570 |
return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
|
| 571 |
|
| 572 |
|
| 573 |
-
# ββββββββββββββββοΏ½οΏ½ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 574 |
-
# Clause explanation panel β lazy-runs Gemini + LIME + attention here
|
| 575 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 576 |
def show_clause_explanation(choice: str, doc_state: dict):
|
| 577 |
if not choice or not doc_state:
|
| 578 |
return '<div class="cx-empty">β Select a clause above to see its full legal analysis.</div>'
|
|
@@ -584,27 +581,22 @@ def show_clause_explanation(choice: str, doc_state: dict):
|
|
| 584 |
|
| 585 |
explanation = generate_explanation(r["clause_text"], r)
|
| 586 |
|
| 587 |
-
# ββ Gemini summary (cached) ββββββββββββββββββββββββββββββββββββββββββββ
|
| 588 |
if nl_summarizer is not None:
|
| 589 |
nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
|
| 590 |
explanation["natural_language_summary"] = nl_text
|
| 591 |
-
# Persist so the PDF report can include it
|
| 592 |
r["nl_summary"] = nl_text
|
| 593 |
else:
|
| 594 |
explanation["natural_language_summary"] = ""
|
| 595 |
|
| 596 |
-
# ββ LIME (lazy, bounded) βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 597 |
lime_words = []
|
| 598 |
if local_explainer is not None and build_predict_fn_for_manager is not None:
|
| 599 |
try:
|
| 600 |
manager.ensure_loaded()
|
| 601 |
predict_fn = build_predict_fn_for_manager(manager)
|
| 602 |
-
lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn
|
| 603 |
-
num_features=10)
|
| 604 |
except Exception as e:
|
| 605 |
print(f"[WARN] LIME path failed: {e}")
|
| 606 |
|
| 607 |
-
# ββ Attention map (lazy, bounded) ββββββββββββββββββββββββββββββββββββββ
|
| 608 |
attn_tokens = []
|
| 609 |
if local_explainer is not None and manager.is_ready:
|
| 610 |
try:
|
|
@@ -614,7 +606,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
|
|
| 614 |
except Exception as e:
|
| 615 |
print(f"[WARN] Attention path failed: {e}")
|
| 616 |
|
| 617 |
-
# ββ Render βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 618 |
lvl = r["risk_level_raw"]
|
| 619 |
color = LEVEL_HEX.get(lvl, "#6b7280")
|
| 620 |
cpct = int(r["risk_score"] * 100)
|
|
@@ -631,7 +622,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
|
|
| 631 |
f'<div class="cx-pills">{evidence_pills}</div>'
|
| 632 |
) if evidence_pills else ""
|
| 633 |
|
| 634 |
-
# NL summary block
|
| 635 |
nl_block = ""
|
| 636 |
nl_text = explanation.get("natural_language_summary", "").strip()
|
| 637 |
if nl_text:
|
|
@@ -642,22 +632,19 @@ def show_clause_explanation(choice: str, doc_state: dict):
|
|
| 642 |
f'<div class="cx-nl">{nl_text}</div>'
|
| 643 |
)
|
| 644 |
|
| 645 |
-
# LIME + attention blocks
|
| 646 |
lime_block = ""
|
| 647 |
if lime_words:
|
| 648 |
-
lime_block = (f'<div class="cx-section-label">π§ͺ LIME β
|
| 649 |
f'{lime_html(lime_words)}')
|
| 650 |
attn_block = ""
|
| 651 |
if attn_tokens:
|
| 652 |
attn_block = (f'<div class="cx-section-label">ποΈ Attention Heatmap</div>'
|
| 653 |
f'{attention_heatmap_html(attn_tokens)}')
|
| 654 |
|
| 655 |
-
# Score-breakdown plaintext (Riya's "Final Score = ..." string)
|
| 656 |
bd_text_block = ""
|
| 657 |
if explanation.get("score_breakdown_text"):
|
| 658 |
bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
|
| 659 |
|
| 660 |
-
# Per-rule cards
|
| 661 |
rules_html = ""
|
| 662 |
for rule_data in explanation.get("rules") or []:
|
| 663 |
rid = rule_data["rule_id"]
|
|
@@ -725,9 +712,6 @@ def show_clause_explanation(choice: str, doc_state: dict):
|
|
| 725 |
</div>"""
|
| 726 |
|
| 727 |
|
| 728 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 729 |
-
# PDF report download
|
| 730 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 731 |
def build_pdf_report(doc_state: dict):
|
| 732 |
if not doc_state:
|
| 733 |
return gr.update(visible=False, value=None)
|
|
@@ -744,7 +728,6 @@ def build_pdf_report(doc_state: dict):
|
|
| 744 |
return gr.update(visible=True, value=None)
|
| 745 |
|
| 746 |
|
| 747 |
-
# ββ Examples ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 748 |
EXAMPLES = [
|
| 749 |
("β‘ High Risk", """1. Liability Cap
|
| 750 |
The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
|
|
@@ -752,10 +735,7 @@ including gross negligence or wilful misconduct of either party.
|
|
| 752 |
2. Non-Compete
|
| 753 |
Employee shall not compete with the company in any capacity for 3 years following
|
| 754 |
termination of this agreement, within the territory of India.
|
| 755 |
-
3.
|
| 756 |
-
The vendor shall collect and process customer personal data as required to fulfil
|
| 757 |
-
the services described in Schedule A of this agreement.
|
| 758 |
-
4. Indemnity
|
| 759 |
The Service Provider shall indemnify and hold harmless the Client against any and all
|
| 760 |
claims, damages, losses, and expenses arising out of or related to this agreement."""),
|
| 761 |
|
|
@@ -766,40 +746,35 @@ Any dispute arising out of this agreement shall be referred to arbitration with
|
|
| 766 |
the seat of arbitration in Singapore.
|
| 767 |
3. Pricing
|
| 768 |
The Company may modify the prices and fees charged under this agreement at
|
| 769 |
-
its sole discretion
|
| 770 |
|
| 771 |
("π’ Low Risk", """1. Renewal
|
| 772 |
This agreement renews automatically every year unless either party provides
|
| 773 |
30 days written notice before the renewal date.
|
| 774 |
2. Governing Law
|
| 775 |
This agreement is governed by the laws of India."""),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
]
|
| 777 |
|
| 778 |
|
| 779 |
-
# ββ CSS (additive over v5.1) ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 780 |
CUSTOM_CSS = """
|
| 781 |
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
|
| 782 |
* { box-sizing: border-box; }
|
| 783 |
body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
|
| 784 |
footer { display:none !important; }
|
| 785 |
.gradio-container { max-width:1080px !important; margin:0 auto !important; }
|
| 786 |
-
.cx-hero { text-align:center; padding:52px 24px 36px;
|
| 787 |
-
|
| 788 |
-
border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
|
| 789 |
-
.cx-hero::before { content:''; position:absolute; inset:0;
|
| 790 |
-
background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%);
|
| 791 |
-
pointer-events:none; }
|
| 792 |
.cx-hero-icon { font-size:44px; margin-bottom:14px; }
|
| 793 |
-
.cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em;
|
| 794 |
-
background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%);
|
| 795 |
-
-webkit-background-clip:text; -webkit-text-fill-color:transparent;
|
| 796 |
-
margin:0 0 10px; line-height:1.1; }
|
| 797 |
.cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
|
| 798 |
.cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
|
| 799 |
-
.cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase;
|
| 800 |
-
|
| 801 |
-
.cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px;
|
| 802 |
-
padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
|
| 803 |
.cx-model-notice strong { color:#a5b4fc; }
|
| 804 |
.cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
|
| 805 |
.cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
|
|
@@ -836,7 +811,6 @@ footer { display:none !important; }
|
|
| 836 |
.cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
|
| 837 |
.cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
|
| 838 |
.cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
|
| 839 |
-
.cx-bd-floor { margin-top:8px; font-size:11px; color:#fbbf24; background:#422006; padding:6px 10px; border-radius:6px; font-family:'DM Sans',sans-serif; }
|
| 840 |
.cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
|
| 841 |
.cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
|
| 842 |
.cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
|
|
@@ -880,7 +854,6 @@ select, .gr-dropdown { background:#0c1525 !important; border-color:#1e293b !impo
|
|
| 880 |
"""
|
| 881 |
|
| 882 |
|
| 883 |
-
# ββ Build UI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 884 |
def build_ui():
|
| 885 |
with gr.Blocks(
|
| 886 |
title="ClauseXplain β AI Legal Risk Dashboard",
|
|
@@ -898,7 +871,7 @@ def build_ui():
|
|
| 898 |
<div class="cx-hero">
|
| 899 |
<div class="cx-hero-icon">βοΈ</div>
|
| 900 |
<h1 class="cx-hero-title">ClauseXplain</h1>
|
| 901 |
-
<p class="cx-hero-sub">
|
| 902 |
<div class="cx-badges">
|
| 903 |
<span class="cx-badge-hero">ICA 1872</span>
|
| 904 |
<span class="cx-badge-hero">DPDPA 2023</span>
|
|
@@ -913,7 +886,7 @@ def build_ui():
|
|
| 913 |
gr.HTML("""
|
| 914 |
<div class="cx-model-notice">
|
| 915 |
β³ The neural model (~2 GB) loads on your <strong>first analysis request</strong> β
|
| 916 |
-
expect 60β90 s. LIME + attention run lazily when you
|
| 917 |
</div>
|
| 918 |
""")
|
| 919 |
|
|
@@ -944,7 +917,6 @@ def build_ui():
|
|
| 944 |
with gr.Accordion("π Full Clause Breakdown", open=False):
|
| 945 |
breakdown_out = gr.Markdown("")
|
| 946 |
|
| 947 |
-
# ββ PDF download ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 948 |
with gr.Row():
|
| 949 |
pdf_dl_btn = gr.Button("π₯ Download PDF Report", variant="primary")
|
| 950 |
pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
# ClauseXplain v5.3 — hardening pass
|
| 3 |
+
# Changes vs v5.2:
|
| 4 |
+
# • ICA_007 (uncapped indemnity) now requires has_uncapped_signal — no more
|
| 5 |
+
# auto-firing on every "indemnify" mention
|
| 6 |
+
# • analyze_document uses level_from_score() from inference.py (single source
|
| 7 |
+
# of truth for the new 0.50 / 0.80 risk-level cutoffs)
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
import os
|
|
|
|
| 21 |
from sklearn.preprocessing import MultiLabelBinarizer
|
| 22 |
from huggingface_hub import hf_hub_download
|
| 23 |
|
|
|
|
| 24 |
from feature_extractor import ClauseFeatureExtractor
|
| 25 |
from explanation import generate_explanation
|
| 26 |
+
from inference import level_from_score # v5.3: single source of truth
|
| 27 |
+
from utils import highlight_keywords
|
| 28 |
|
| 29 |
# ── Optional / fail-soft integrations ─────────────────────────────────────────
|
| 30 |
try:
|
|
|
|
| 55 |
print(f"[WARN] report disabled: {_e}")
|
| 56 |
generate_report = None
|
| 57 |
|
|
|
|
| 58 |
DEVICE = torch.device("cpu")
|
| 59 |
|
|
|
|
| 60 |
CLAUSE_CLASSES = [
|
| 61 |
"Cap On Liability", "Change Of Control", "Covenant Not To Sue",
|
| 62 |
"Exclusivity", "Governing Law", "IP Ownership Assignment",
|
|
|
|
| 77 |
RISK_CLASSES = ["ambiguity", "enforceability", "financial", "ip", "structural"]
|
| 78 |
|
| 79 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 80 |
+
# Symbolic rules — v5.3 tightened
|
| 81 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 82 |
SYMBOLIC_RULES = [
|
| 83 |
{"rule_id": "ICA_001", "name": "Unconscionable Liability Cap",
|
|
|
|
| 92 |
{"rule_id": "ICA_004", "name": "Penalty Clause Exceeds Actual Damage",
|
| 93 |
"reference": "Indian Contract Act 1872, S.74", "penalty": 0.40, "category": "financial",
|
| 94 |
"condition": lambda f: f.get("has_liquidated_damages") and f.get("damages_exceed_loss")},
|
| 95 |
+
# ICA_005: only fires on explicit gambling vocab — no more "contingent on closing"
|
| 96 |
+
{"rule_id": "ICA_005", "name": "Wagering / Gambling Agreement",
|
| 97 |
"reference": "Indian Contract Act 1872, S.30", "penalty": 0.70, "category": "enforceability",
|
| 98 |
"condition": lambda f: f.get("is_wagering_clause")},
|
| 99 |
{"rule_id": "ICA_006", "name": "Restraint of Legal Proceedings",
|
| 100 |
"reference": "Indian Contract Act 1872, S.28", "penalty": 0.60, "category": "enforceability",
|
| 101 |
"condition": lambda f: f.get("restrains_legal_proceedings")},
|
| 102 |
+
# ICA_007 TIGHTENED: indemnity + explicit uncapped signal + no cap
|
| 103 |
{"rule_id": "ICA_007", "name": "Uncapped Indemnity Obligation",
|
| 104 |
"reference": "Indian Contract Act 1872, S.124", "penalty": 0.50, "category": "financial",
|
| 105 |
+
"condition": lambda f: (
|
| 106 |
+
f.get("has_indemnity_clause")
|
| 107 |
+
and f.get("has_uncapped_signal")
|
| 108 |
+
and not f.get("indemnity_capped")
|
| 109 |
+
)},
|
| 110 |
{"rule_id": "ICA_008", "name": "Auto-Renewal Without Opt-Out Window",
|
| 111 |
"reference": "Indian Contract Act 1872 + CPA 2019", "penalty": 0.35, "category": "enforceability",
|
| 112 |
"condition": lambda f: f.get("has_auto_renewal") and not f.get("has_opt_out_window")},
|
| 113 |
{"rule_id": "ICA_009", "name": "Arbitration in Distant Venue",
|
| 114 |
"reference": "Arbitration and Conciliation Act 1996, S.20", "penalty": 0.40, "category": "enforceability",
|
| 115 |
"condition": lambda f: f.get("has_arbitration") and f.get("arbitration_distant_venue")},
|
| 116 |
+
# ICA_010 narrowed via tightened has_exclusivity patterns in feature_extractor
|
| 117 |
{"rule_id": "ICA_010", "name": "Indefinite Exclusivity",
|
| 118 |
"reference": "Indian Contract Act 1872, S.27", "penalty": 0.50, "category": "enforceability",
|
| 119 |
"condition": lambda f: f.get("has_exclusivity") and not f.get("exclusivity_term_defined")},
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 145 |
+
# Model (unchanged)
|
| 146 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 147 |
class ClauseXplainV5(nn.Module):
|
| 148 |
def __init__(self, num_clause_labels: int, num_risk_labels: int):
|
|
|
|
| 219 |
repo_id="riyasuryawanshi746/clauseXplain",
|
| 220 |
filename="clausexplain_v5_best.pt",
|
| 221 |
)
|
|
|
|
| 222 |
checkpoint = torch.load(
|
| 223 |
ckpt_path,
|
| 224 |
map_location=torch.device("cpu"),
|
|
|
|
| 308 |
"top_risk_cats": top_risks,
|
| 309 |
"triggered_rules": triggered_clean,
|
| 310 |
"features": {k: v for k, v in features.items() if v},
|
| 311 |
+
"evidence": evidence,
|
| 312 |
"score_breakdown": fusion["breakdown"],
|
| 313 |
"confidence": confidence,
|
| 314 |
}
|
|
|
|
| 340 |
|
| 341 |
scores = [r["risk_score"] for r in results]
|
| 342 |
overall = round(0.70 * max(scores) + 0.30 * (sum(scores) / len(scores)), 3)
|
| 343 |
+
# v5.3: single source of truth for thresholds
|
| 344 |
+
level, _ = level_from_score(overall)
|
|
|
|
| 345 |
return {
|
| 346 |
"overall_risk": overall,
|
| 347 |
"overall_level": level,
|
|
|
|
| 354 |
manager = ModelManager()
|
| 355 |
|
| 356 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 357 |
+
# UI helpers (unchanged from v5.2)
|
| 358 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 359 |
LEVEL_COLOR = {"Low": "π’", "Medium": "π‘", "High": "π΄"}
|
| 360 |
LEVEL_HEX = {"Low": "#10b981", "Medium": "#f59e0b", "High": "#ef4444"}
|
|
|
|
| 418 |
if not breakdown:
|
| 419 |
return ""
|
| 420 |
w = breakdown["weights"]
|
|
|
|
|
|
|
| 421 |
return f"""
|
| 422 |
<div class="cx-breakdown">
|
| 423 |
<div class="cx-bd-row">
|
|
|
|
| 436 |
<span class="cx-bd-k">Final</span>
|
| 437 |
<span class="cx-bd-final-v">{breakdown['final']:.3f}</span>
|
| 438 |
</div>
|
|
|
|
| 439 |
</div>"""
|
| 440 |
|
| 441 |
|
| 442 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 443 |
+
# Analysis flow (unchanged structurally)
|
| 444 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 445 |
def _run_analysis(text: str):
|
| 446 |
if not text or len(text.strip()) < 30:
|
|
|
|
| 508 |
</div>
|
| 509 |
{model_note}"""
|
| 510 |
|
|
|
|
| 511 |
top_parts = ['<div class="cx-section-title">π₯ Top Risk Clauses</div>',
|
| 512 |
'<div class="cx-top-grid">']
|
| 513 |
for r in doc["top_risks"]:
|
|
|
|
| 539 |
top_parts.append("</div>")
|
| 540 |
top_html = "\n".join(top_parts)
|
| 541 |
|
|
|
|
| 542 |
rows = [
|
| 543 |
"## π All Clauses\n",
|
| 544 |
"| # | Marker | Level | Score | Confidence | Symbolic | Preview |",
|
|
|
|
| 566 |
f"{r['clause_text'][:55].replace(chr(10), ' ')}β¦"
|
| 567 |
for r in doc["clauses"]
|
| 568 |
]
|
|
|
|
| 569 |
pdf_update = gr.update(visible=True, value=None)
|
| 570 |
return summary_html, top_html, breakdown_md, gr.update(choices=clause_choices, value=None), doc, pdf_update
|
| 571 |
|
| 572 |
|
|
|
|
|
|
|
|
|
|
| 573 |
def show_clause_explanation(choice: str, doc_state: dict):
|
| 574 |
if not choice or not doc_state:
|
| 575 |
return '<div class="cx-empty">β Select a clause above to see its full legal analysis.</div>'
|
|
|
|
| 581 |
|
| 582 |
explanation = generate_explanation(r["clause_text"], r)
|
| 583 |
|
|
|
|
| 584 |
if nl_summarizer is not None:
|
| 585 |
nl_text = nl_summarizer.generate_summary(explanation, r["clause_text"])
|
| 586 |
explanation["natural_language_summary"] = nl_text
|
|
|
|
| 587 |
r["nl_summary"] = nl_text
|
| 588 |
else:
|
| 589 |
explanation["natural_language_summary"] = ""
|
| 590 |
|
|
|
|
| 591 |
lime_words = []
|
| 592 |
if local_explainer is not None and build_predict_fn_for_manager is not None:
|
| 593 |
try:
|
| 594 |
manager.ensure_loaded()
|
| 595 |
predict_fn = build_predict_fn_for_manager(manager)
|
| 596 |
+
lime_words = local_explainer.explain_with_lime(r["clause_text"], predict_fn)
|
|
|
|
| 597 |
except Exception as e:
|
| 598 |
print(f"[WARN] LIME path failed: {e}")
|
| 599 |
|
|
|
|
| 600 |
attn_tokens = []
|
| 601 |
if local_explainer is not None and manager.is_ready:
|
| 602 |
try:
|
|
|
|
| 606 |
except Exception as e:
|
| 607 |
print(f"[WARN] Attention path failed: {e}")
|
| 608 |
|
|
|
|
| 609 |
lvl = r["risk_level_raw"]
|
| 610 |
color = LEVEL_HEX.get(lvl, "#6b7280")
|
| 611 |
cpct = int(r["risk_score"] * 100)
|
|
|
|
| 622 |
f'<div class="cx-pills">{evidence_pills}</div>'
|
| 623 |
) if evidence_pills else ""
|
| 624 |
|
|
|
|
| 625 |
nl_block = ""
|
| 626 |
nl_text = explanation.get("natural_language_summary", "").strip()
|
| 627 |
if nl_text:
|
|
|
|
| 632 |
f'<div class="cx-nl">{nl_text}</div>'
|
| 633 |
)
|
| 634 |
|
|
|
|
| 635 |
lime_block = ""
|
| 636 |
if lime_words:
|
| 637 |
+
lime_block = (f'<div class="cx-section-label">π§ͺ LIME β Key Legal Terms Driving Risk</div>'
|
| 638 |
f'{lime_html(lime_words)}')
|
| 639 |
attn_block = ""
|
| 640 |
if attn_tokens:
|
| 641 |
attn_block = (f'<div class="cx-section-label">ποΈ Attention Heatmap</div>'
|
| 642 |
f'{attention_heatmap_html(attn_tokens)}')
|
| 643 |
|
|
|
|
| 644 |
bd_text_block = ""
|
| 645 |
if explanation.get("score_breakdown_text"):
|
| 646 |
bd_text_block = f'<div class="cx-bd-text">{explanation["score_breakdown_text"]}</div>'
|
| 647 |
|
|
|
|
| 648 |
rules_html = ""
|
| 649 |
for rule_data in explanation.get("rules") or []:
|
| 650 |
rid = rule_data["rule_id"]
|
|
|
|
| 712 |
</div>"""
|
| 713 |
|
| 714 |
|
|
|
|
|
|
|
|
|
|
| 715 |
def build_pdf_report(doc_state: dict):
|
| 716 |
if not doc_state:
|
| 717 |
return gr.update(visible=False, value=None)
|
|
|
|
| 728 |
return gr.update(visible=True, value=None)
|
| 729 |
|
| 730 |
|
|
|
|
| 731 |
EXAMPLES = [
|
| 732 |
("β‘ High Risk", """1. Liability Cap
|
| 733 |
The total liability of either party shall not exceed Rs. 50,000 under any circumstances,
|
|
|
|
| 735 |
2. Non-Compete
|
| 736 |
Employee shall not compete with the company in any capacity for 3 years following
|
| 737 |
termination of this agreement, within the territory of India.
|
| 738 |
+
3. Indemnity
|
|
|
|
|
|
|
|
|
|
| 739 |
The Service Provider shall indemnify and hold harmless the Client against any and all
|
| 740 |
claims, damages, losses, and expenses arising out of or related to this agreement."""),
|
| 741 |
|
|
|
|
| 746 |
the seat of arbitration in Singapore.
|
| 747 |
3. Pricing
|
| 748 |
The Company may modify the prices and fees charged under this agreement at
|
| 749 |
+
its sole discretion to modify the terms upon written notice."""),
|
| 750 |
|
| 751 |
("π’ Low Risk", """1. Renewal
|
| 752 |
This agreement renews automatically every year unless either party provides
|
| 753 |
30 days written notice before the renewal date.
|
| 754 |
2. Governing Law
|
| 755 |
This agreement is governed by the laws of India."""),
|
| 756 |
+
|
| 757 |
+
("π§ͺ Benign (M&A-style)", """Compensation paid hereunder shall be exclusive of the Company's
|
| 758 |
+
contributions to statutory benefits. Payment of the closing bonus is
|
| 759 |
+
contingent on the occurrence of the closing of the merger transaction
|
| 760 |
+
and continued employment through such date."""),
|
| 761 |
]
|
| 762 |
|
| 763 |
|
|
|
|
| 764 |
CUSTOM_CSS = """
|
| 765 |
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,300;0,9..40,400;0,9..40,500;0,9..40,600;0,9..40,700;1,9..40,400&family=DM+Mono:wght@400;500&display=swap');
|
| 766 |
* { box-sizing: border-box; }
|
| 767 |
body, .gradio-container { background:#080d1a !important; font-family:'DM Sans',sans-serif !important; color:#e2e8f0 !important; }
|
| 768 |
footer { display:none !important; }
|
| 769 |
.gradio-container { max-width:1080px !important; margin:0 auto !important; }
|
| 770 |
+
.cx-hero { text-align:center; padding:52px 24px 36px; background:linear-gradient(135deg,#0f172a 0%,#1a1040 60%,#0f172a 100%); border-radius:16px; margin-bottom:8px; position:relative; overflow:hidden; }
|
| 771 |
+
.cx-hero::before { content:''; position:absolute; inset:0; background:radial-gradient(ellipse 70% 60% at 50% -10%,#6366f135 0%,transparent 70%); pointer-events:none; }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
.cx-hero-icon { font-size:44px; margin-bottom:14px; }
|
| 773 |
+
.cx-hero-title { font-size:38px; font-weight:700; letter-spacing:-.025em; background:linear-gradient(135deg,#f1f5f9 20%,#a5b4fc 80%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; margin:0 0 10px; line-height:1.1; }
|
|
|
|
|
|
|
|
|
|
| 774 |
.cx-hero-sub { font-size:15px; color:#94a3b8; margin:0 0 22px; font-weight:400; }
|
| 775 |
.cx-badges { display:flex; gap:8px; flex-wrap:wrap; justify-content:center; }
|
| 776 |
+
.cx-badge-hero { font-size:11px; font-weight:600; letter-spacing:.07em; text-transform:uppercase; padding:5px 12px; border-radius:20px; border:1px solid #2d3a55; background:#131c30; color:#8b9fc7; }
|
| 777 |
+
.cx-model-notice { background:#111827; border:1px solid #1e293b; border-radius:10px; padding:11px 16px; font-size:13px; color:#94a3b8; display:flex; align-items:center; gap:10px; margin-bottom:4px; }
|
|
|
|
|
|
|
| 778 |
.cx-model-notice strong { color:#a5b4fc; }
|
| 779 |
.cx-card { background:#111827; border:1px solid #1e293b; border-radius:12px; padding:16px 20px; }
|
| 780 |
.cx-summary-grid { display:grid; grid-template-columns:180px 1fr 1fr 1fr 1fr; gap:12px; align-items:stretch; margin:4px 0 8px; }
|
|
|
|
| 811 |
.cx-bd-formula { color:#a5b4fc; font-size:13px; padding:4px 0 8px; }
|
| 812 |
.cx-bd-final { display:grid; grid-template-columns:80px 1fr; padding-top:4px; }
|
| 813 |
.cx-bd-final-v { color:#34d399; font-weight:700; font-size:16px; }
|
|
|
|
| 814 |
.cx-bd-text { font-family:'DM Mono',monospace; font-size:12px; color:#94a3b8; padding:6px 14px; }
|
| 815 |
.cx-divider { border:none; border-top:1px solid #1a2332; margin:24px 0; }
|
| 816 |
.cx-empty { color:#374151; font-size:14px; padding:28px 0; text-align:center; }
|
|
|
|
| 854 |
"""
|
| 855 |
|
| 856 |
|
|
|
|
| 857 |
def build_ui():
|
| 858 |
with gr.Blocks(
|
| 859 |
title="ClauseXplain β AI Legal Risk Dashboard",
|
|
|
|
| 871 |
<div class="cx-hero">
|
| 872 |
<div class="cx-hero-icon">⚖️</div>
|
| 873 |
<h1 class="cx-hero-title">ClauseXplain</h1>
|
| 874 |
+
<p class="cx-hero-sub">International contract neural backbone, localised via Indian neuro-symbolic legal reasoning</p>
|
| 875 |
<div class="cx-badges">
|
| 876 |
<span class="cx-badge-hero">ICA 1872</span>
|
| 877 |
<span class="cx-badge-hero">DPDPA 2023</span>
|
|
|
|
| 886 |
gr.HTML("""
|
| 887 |
<div class="cx-model-notice">
|
| 888 |
⏳ The neural model (~2 GB) loads on your <strong>first analysis request</strong> —
|
| 889 |
+
expect 60–90 s. Per-clause LIME + attention run lazily when you inspect a clause (~15–25 s).
|
| 890 |
</div>
|
| 891 |
""")
|
| 892 |
|
|
|
|
| 917 |
with gr.Accordion("π Full Clause Breakdown", open=False):
|
| 918 |
breakdown_out = gr.Markdown("")
|
| 919 |
|
|
|
|
| 920 |
with gr.Row():
|
| 921 |
pdf_dl_btn = gr.Button("π₯ Download PDF Report", variant="primary")
|
| 922 |
pdf_file_out = gr.File(label="Compliance Report", visible=False, interactive=False)
|
explanation.py
CHANGED
|
@@ -1,15 +1,10 @@
|
|
| 1 |
# explanation.py
|
| 2 |
-
#
|
| 3 |
-
# confidence_level, natural_language_summary placeholder, and a formatted
|
| 4 |
-
# score_breakdown_text.
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
from inference import RULE_FEATURE_DEPS
|
| 8 |
|
| 9 |
|
| 10 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
-
# Per-rule explanations (kept identical to v5.1)
|
| 12 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 13 |
RULE_EXPLANATIONS = {
|
| 14 |
"ICA_001": {
|
| 15 |
"why": "Liability is capped even for gross negligence or wilful misconduct.",
|
|
@@ -32,9 +27,9 @@ RULE_EXPLANATIONS = {
|
|
| 32 |
"suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
|
| 33 |
},
|
| 34 |
"ICA_005": {
|
| 35 |
-
"why": "The clause
|
| 36 |
"meaning": "Such agreements are void under Indian Contract Act S.30.",
|
| 37 |
-
"suggestion": "Remove or restructure the
|
| 38 |
},
|
| 39 |
"ICA_006": {
|
| 40 |
"why": "The clause restricts a party from pursuing legal proceedings.",
|
|
@@ -42,8 +37,8 @@ RULE_EXPLANATIONS = {
|
|
| 42 |
"suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
|
| 43 |
},
|
| 44 |
"ICA_007": {
|
| 45 |
-
"why": "An indemnity obligation is
|
| 46 |
-
"meaning": "You could face
|
| 47 |
"suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
|
| 48 |
},
|
| 49 |
"ICA_008": {
|
|
@@ -57,7 +52,7 @@ RULE_EXPLANATIONS = {
|
|
| 57 |
"suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
|
| 58 |
},
|
| 59 |
"ICA_010": {
|
| 60 |
-
"why": "Exclusivity
|
| 61 |
"meaning": "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
|
| 62 |
"suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
|
| 63 |
},
|
|
@@ -98,9 +93,6 @@ RULE_EXPLANATIONS = {
|
|
| 98 |
},
|
| 99 |
}
|
| 100 |
|
| 101 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
-
# Risk-level prose
|
| 103 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
RISK_CONTEXT = {
|
| 105 |
"Low": "This clause appears relatively standard with minimal legal exposure.",
|
| 106 |
"Medium": "This clause contains terms that warrant careful review before signing.",
|
|
@@ -117,9 +109,6 @@ CATEGORY_CONTEXT = {
|
|
| 117 |
}
|
| 118 |
|
| 119 |
|
| 120 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 121 |
-
# Helpers
|
| 122 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 123 |
def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
|
| 124 |
deps = RULE_FEATURE_DEPS.get(rule_id, [])
|
| 125 |
snippets: list[dict] = []
|
|
@@ -134,7 +123,6 @@ def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
|
|
| 134 |
|
| 135 |
|
| 136 |
def _flat_evidence(evidence: dict) -> list[dict]:
|
| 137 |
-
"""Riya's Prompt 1 evidence shape."""
|
| 138 |
out = []
|
| 139 |
for feat, hits in (evidence or {}).items():
|
| 140 |
for h in hits:
|
|
@@ -154,36 +142,14 @@ def _format_score_breakdown_text(breakdown: dict | None, fused: float) -> str:
|
|
| 154 |
nrm = breakdown.get("neural_score", 0.0)
|
| 155 |
sym = breakdown.get("symbolic_score", 0.0)
|
| 156 |
fin = breakdown.get("final", fused)
|
| 157 |
-
note = " [floor 0.30 applied]" if breakdown.get("floor_applied") else ""
|
| 158 |
return (
|
| 159 |
f"Final Score = {fin:.2f} "
|
| 160 |
-
f"(Neural {nrm:.2f}
|
| 161 |
-
f"Symbolic {sym:.2f}
|
| 162 |
)
|
| 163 |
|
| 164 |
|
| 165 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
-
# Main entry point
|
| 167 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
def generate_explanation(text: str, result: dict) -> dict:
|
| 169 |
-
"""
|
| 170 |
-
Returns:
|
| 171 |
-
{
|
| 172 |
-
# ββ Original keys (backward compatible) ββ
|
| 173 |
-
"overview": str,
|
| 174 |
-
"rules": list[dict], # with per-rule evidence
|
| 175 |
-
"general_tip": str,
|
| 176 |
-
"score_breakdown": dict | None,
|
| 177 |
-
"confidence": dict | None,
|
| 178 |
-
|
| 179 |
-
# ββ New keys (Prompt 1) ββ
|
| 180 |
-
"risk_breakdown": dict # neural/symbolic/weights/final
|
| 181 |
-
"evidence": list[dict], # flat list across all features
|
| 182 |
-
"confidence_level": str # "Low"|"Medium"|"High"
|
| 183 |
-
"natural_language_summary": str # filled by NLSummarizer (placeholder here)
|
| 184 |
-
"score_breakdown_text": str # human-readable formula string
|
| 185 |
-
}
|
| 186 |
-
"""
|
| 187 |
level_raw = result.get("risk_level_raw", "Low")
|
| 188 |
triggered = result.get("triggered_rules", [])
|
| 189 |
top_cats = result.get("top_risk_cats", [])
|
|
@@ -192,7 +158,6 @@ def generate_explanation(text: str, result: dict) -> dict:
|
|
| 192 |
breakdown = result.get("score_breakdown")
|
| 193 |
confidence = result.get("confidence") or {}
|
| 194 |
|
| 195 |
-
# ββ Overview sentence ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
primary_cat = top_cats[0][0] if top_cats else "structural"
|
| 197 |
cat_desc = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
|
| 198 |
overview = (
|
|
@@ -200,7 +165,6 @@ def generate_explanation(text: str, result: dict) -> dict:
|
|
| 200 |
f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
|
| 201 |
)
|
| 202 |
|
| 203 |
-
# ββ Per-rule explanations ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 204 |
rule_details = []
|
| 205 |
for rule in triggered:
|
| 206 |
rid = rule.get("rule_id", "")
|
|
@@ -217,7 +181,6 @@ def generate_explanation(text: str, result: dict) -> dict:
|
|
| 217 |
"evidence": _evidence_for_rule(rid, evidence),
|
| 218 |
})
|
| 219 |
|
| 220 |
-
# ββ General tip ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 221 |
general_tip = ""
|
| 222 |
if not triggered:
|
| 223 |
if level_raw == "Low":
|
|
@@ -229,31 +192,28 @@ def generate_explanation(text: str, result: dict) -> dict:
|
|
| 229 |
general_tip = ("High neural risk score despite no specific rule triggers. "
|
| 230 |
"The clause may contain broad or one-sided language - seek legal review.")
|
| 231 |
|
| 232 |
-
#
|
| 233 |
risk_breakdown = breakdown or {
|
| 234 |
"neural_score": result.get("neural_score", 0.0),
|
| 235 |
"symbolic_score": result.get("symbolic_score", 0.0),
|
| 236 |
-
"weights": {"neural": 0.
|
| 237 |
"raw_fused": risk_score,
|
| 238 |
"floor_applied": False,
|
| 239 |
"final": risk_score,
|
| 240 |
-
"formula": f"(
|
| 241 |
-
f"(
|
| 242 |
f"= {risk_score:.3f}",
|
| 243 |
}
|
| 244 |
|
| 245 |
return {
|
| 246 |
-
# Original / backward-compatible
|
| 247 |
"overview": overview,
|
| 248 |
"rules": rule_details,
|
| 249 |
"general_tip": general_tip,
|
| 250 |
"score_breakdown": breakdown,
|
| 251 |
"confidence": confidence,
|
| 252 |
-
|
| 253 |
-
# Prompt 1 additions
|
| 254 |
"risk_breakdown": risk_breakdown,
|
| 255 |
"evidence": _flat_evidence(evidence),
|
| 256 |
"confidence_level": confidence.get("level", "Medium"),
|
| 257 |
-
"natural_language_summary": "",
|
| 258 |
"score_breakdown_text": _format_score_breakdown_text(breakdown, risk_score),
|
| 259 |
}
|
|
|
|
| 1 |
# explanation.py
|
| 2 |
+
# v5.3 — fallback risk_breakdown weights aligned to new fusion config.
|
|
|
|
|
|
|
| 3 |
|
| 4 |
from __future__ import annotations
|
| 5 |
from inference import RULE_FEATURE_DEPS
|
| 6 |
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
RULE_EXPLANATIONS = {
|
| 9 |
"ICA_001": {
|
| 10 |
"why": "Liability is capped even for gross negligence or wilful misconduct.",
|
|
|
|
| 27 |
"suggestion": "Link the damages figure to a genuine pre-estimate of foreseeable loss.",
|
| 28 |
},
|
| 29 |
"ICA_005": {
|
| 30 |
+
"why": "The clause uses gambling, wagering, or betting vocabulary.",
|
| 31 |
"meaning": "Such agreements are void under Indian Contract Act S.30.",
|
| 32 |
+
"suggestion": "Remove or restructure the wagering element of this clause.",
|
| 33 |
},
|
| 34 |
"ICA_006": {
|
| 35 |
"why": "The clause restricts a party from pursuing legal proceedings.",
|
|
|
|
| 37 |
"suggestion": "Replace with a structured dispute-resolution mechanism (arbitration / mediation).",
|
| 38 |
},
|
| 39 |
"ICA_007": {
|
| 40 |
+
"why": "An indemnity obligation is paired with uncapped / unlimited liability language.",
|
| 41 |
+
"meaning": "You could face open-ended financial exposure for third-party claims.",
|
| 42 |
"suggestion": "Cap the indemnity at a multiple of contract value and carve out consequential losses.",
|
| 43 |
},
|
| 44 |
"ICA_008": {
|
|
|
|
| 52 |
"suggestion": "Set the seat of arbitration in a neutral, accessible Indian city (e.g. Mumbai, Delhi).",
|
| 53 |
},
|
| 54 |
"ICA_010": {
|
| 55 |
+
"why": "Exclusivity rights are granted without a defined term, making them open-ended.",
|
| 56 |
"meaning": "Indefinite restraints of trade are typically void under Indian Contract Act S.27.",
|
| 57 |
"suggestion": "Fix a clear exclusivity term (e.g. 1-3 years) with defined renewal mechanics.",
|
| 58 |
},
|
|
|
|
| 93 |
},
|
| 94 |
}
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
RISK_CONTEXT = {
|
| 97 |
"Low": "This clause appears relatively standard with minimal legal exposure.",
|
| 98 |
"Medium": "This clause contains terms that warrant careful review before signing.",
|
|
|
|
| 109 |
}
|
| 110 |
|
| 111 |
|
|
|
|
|
|
|
|
|
|
| 112 |
def _evidence_for_rule(rule_id: str, evidence: dict) -> list[dict]:
|
| 113 |
deps = RULE_FEATURE_DEPS.get(rule_id, [])
|
| 114 |
snippets: list[dict] = []
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
def _flat_evidence(evidence: dict) -> list[dict]:
|
|
|
|
| 126 |
out = []
|
| 127 |
for feat, hits in (evidence or {}).items():
|
| 128 |
for h in hits:
|
|
|
|
| 142 |
nrm = breakdown.get("neural_score", 0.0)
|
| 143 |
sym = breakdown.get("symbolic_score", 0.0)
|
| 144 |
fin = breakdown.get("final", fused)
|
|
|
|
| 145 |
return (
|
| 146 |
f"Final Score = {fin:.2f} "
|
| 147 |
+
f"(Neural {nrm:.2f} Γ {w.get('neural', 0):.2f} + "
|
| 148 |
+
f"Symbolic {sym:.2f} Γ {w.get('symbolic', 0):.2f})"
|
| 149 |
)
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
|
|
|
| 152 |
def generate_explanation(text: str, result: dict) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
level_raw = result.get("risk_level_raw", "Low")
|
| 154 |
triggered = result.get("triggered_rules", [])
|
| 155 |
top_cats = result.get("top_risk_cats", [])
|
|
|
|
| 158 |
breakdown = result.get("score_breakdown")
|
| 159 |
confidence = result.get("confidence") or {}
|
| 160 |
|
|
|
|
| 161 |
primary_cat = top_cats[0][0] if top_cats else "structural"
|
| 162 |
cat_desc = CATEGORY_CONTEXT.get(primary_cat, "legal concerns")
|
| 163 |
overview = (
|
|
|
|
| 165 |
f"The primary concern is {cat_desc} (fused risk score: {risk_score:.2f})."
|
| 166 |
)
|
| 167 |
|
|
|
|
| 168 |
rule_details = []
|
| 169 |
for rule in triggered:
|
| 170 |
rid = rule.get("rule_id", "")
|
|
|
|
| 181 |
"evidence": _evidence_for_rule(rid, evidence),
|
| 182 |
})
|
| 183 |
|
|
|
|
| 184 |
general_tip = ""
|
| 185 |
if not triggered:
|
| 186 |
if level_raw == "Low":
|
|
|
|
| 192 |
general_tip = ("High neural risk score despite no specific rule triggers. "
|
| 193 |
"The clause may contain broad or one-sided language - seek legal review.")
|
| 194 |
|
| 195 |
+
# v5.3: fallback weights updated to new neural-dominant config
|
| 196 |
risk_breakdown = breakdown or {
|
| 197 |
"neural_score": result.get("neural_score", 0.0),
|
| 198 |
"symbolic_score": result.get("symbolic_score", 0.0),
|
| 199 |
+
"weights": {"neural": 0.75, "symbolic": 0.25},
|
| 200 |
"raw_fused": risk_score,
|
| 201 |
"floor_applied": False,
|
| 202 |
"final": risk_score,
|
| 203 |
+
"formula": f"(0.75 Γ {result.get('neural_score', 0):.3f}) + "
|
| 204 |
+
f"(0.25 Γ {result.get('symbolic_score', 0):.3f}) "
|
| 205 |
f"= {risk_score:.3f}",
|
| 206 |
}
|
| 207 |
|
| 208 |
return {
|
|
|
|
| 209 |
"overview": overview,
|
| 210 |
"rules": rule_details,
|
| 211 |
"general_tip": general_tip,
|
| 212 |
"score_breakdown": breakdown,
|
| 213 |
"confidence": confidence,
|
|
|
|
|
|
|
| 214 |
"risk_breakdown": risk_breakdown,
|
| 215 |
"evidence": _flat_evidence(evidence),
|
| 216 |
"confidence_level": confidence.get("level", "Medium"),
|
| 217 |
+
"natural_language_summary": "",
|
| 218 |
"score_breakdown_text": _format_score_breakdown_text(breakdown, risk_score),
|
| 219 |
}
|
feature_extractor.py
CHANGED
|
@@ -1,186 +1,206 @@
|
|
| 1 |
# feature_extractor.py
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
#
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
import re
|
| 8 |
|
| 9 |
|
| 10 |
class ClauseFeatureExtractor:
|
| 11 |
-
"""
|
| 12 |
-
Regex-based hybrid extractor.
|
| 13 |
-
|
| 14 |
-
Public API:
|
| 15 |
-
β’ extract(text) -> (features, evidence_dict)
|
| 16 |
-
features: dict[str, bool|int]
|
| 17 |
-
evidence_dict: dict[feature_name, list[hit]]
|
| 18 |
-
hit = {"phrase": str, "span": [start, end], "label": str}
|
| 19 |
-
|
| 20 |
-
β’ extract_unified(text) -> dict
|
| 21 |
-
Returns the format requested in Riya's Prompt 3:
|
| 22 |
-
{
|
| 23 |
-
"<feature_name>": True/False/int,
|
| 24 |
-
...,
|
| 25 |
-
"evidence": [
|
| 26 |
-
{"feature": str, "keywords": [str, ...],
|
| 27 |
-
"evidence_text": str, "span": [start, end]},
|
| 28 |
-
...
|
| 29 |
-
],
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
β’ flatten_evidence(evidence_dict) -> list[dict]
|
| 33 |
-
Convert the nested evidence dict to a flat list of hits
|
| 34 |
-
(one hit per matched phrase).
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
|
|
|
|
| 38 |
"has_liability_cap": [
|
| 39 |
-
(re.compile(r"\bshall\s+not\s+exceed\b", re.I),
|
| 40 |
(re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
|
| 41 |
-
(re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I),
|
| 42 |
(re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|βΉ)", re.I), "limited to (amount)"),
|
| 43 |
-
(re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I),
|
| 44 |
],
|
| 45 |
"excludes_gross_negligence": [
|
| 46 |
-
(re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I),
|
| 47 |
-
(re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I),
|
| 48 |
-
(re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I),
|
| 49 |
-
(re.compile(r"\brecklessness?\b", re.I),
|
| 50 |
-
(re.compile(r"\bbad\s+faith\b", re.I),
|
| 51 |
-
(re.compile(r"\bfraud(?:ulent)?\b", re.I),
|
| 52 |
],
|
|
|
|
|
|
|
| 53 |
"has_liquidated_damages": [
|
| 54 |
-
(re.compile(r"\bliquidated\s+damages?\b", re.I),
|
| 55 |
(re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
|
| 56 |
-
(re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I),
|
| 57 |
],
|
| 58 |
"damages_exceed_loss": [
|
| 59 |
(re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
|
| 60 |
-
(re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I),
|
| 61 |
-
(re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I),
|
| 62 |
-
(re.compile(r"\bpenalty\s+clause\b", re.I),
|
| 63 |
],
|
|
|
|
|
|
|
| 64 |
"is_wagering_clause": [
|
| 65 |
-
(re.compile(r"\
|
| 66 |
-
(re.compile(r"\
|
| 67 |
-
(re.compile(r"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
],
|
|
|
|
| 69 |
"restrains_legal_proceedings": [
|
| 70 |
(re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
|
| 71 |
(re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
|
| 72 |
-
(re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\b", re.I),
|
| 73 |
-
(re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I),
|
| 74 |
],
|
|
|
|
|
|
|
| 75 |
"unilateral_termination": [
|
| 76 |
-
(re.compile(r"\bmay\s+terminate\b", re.I),
|
| 77 |
-
(re.compile(r"\bsole\s+(?:discretion|option)\b", re.I), "sole discretion"),
|
| 78 |
(re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
|
| 79 |
-
(re.compile(r"\bat\s+(?:its|the)\s+
|
| 80 |
],
|
| 81 |
"notice_period_defined": [
|
| 82 |
(re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
|
| 83 |
-
(re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I),
|
| 84 |
],
|
|
|
|
|
|
|
| 85 |
"processes_personal_data": [
|
| 86 |
-
(re.compile(r"\bpersonal\s+(?:data|information)\b", re.I),
|
| 87 |
(re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
|
| 88 |
-
(re.compile(r"\bdata\s+(?:subject|principal)\b", re.I),
|
| 89 |
-
(re.compile(r"\bpii\b", re.I),
|
| 90 |
],
|
| 91 |
"processes_sensitive_data": [
|
| 92 |
-
(re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I),
|
| 93 |
(re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
|
| 94 |
],
|
| 95 |
"has_data_retention_clause": [
|
| 96 |
-
(re.compile(r"\bretention\s+period\b", re.I),
|
| 97 |
(re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
|
| 98 |
(re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
|
| 99 |
-
(re.compile(r"\
|
| 100 |
],
|
| 101 |
"has_breach_notification": [
|
| 102 |
-
(re.compile(r"\bbreach\s+notification\b", re.I),
|
| 103 |
-
(re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I),
|
| 104 |
(re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
|
| 105 |
-
(re.compile(r"\bsecurity\s+incident\b", re.I),
|
| 106 |
],
|
| 107 |
"has_consent_clause": [
|
| 108 |
(re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
|
| 109 |
-
(re.compile(r"\bopt[\-\s]?in\b", re.I),
|
| 110 |
-
(re.compile(r"\bexplicit\s+consent\b", re.I),
|
| 111 |
],
|
| 112 |
"handles_digital_data": [
|
| 113 |
-
(re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I),
|
| 114 |
-
(re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I),
|
| 115 |
],
|
| 116 |
"has_security_clause": [
|
| 117 |
-
(re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I),
|
| 118 |
-
(re.compile(r"\bencryption\b", re.I),
|
| 119 |
-
(re.compile(r"\bcyber[\-\s]?security\b", re.I),
|
| 120 |
-
(re.compile(r"\baccess\s+controls?\b", re.I),
|
| 121 |
-
(re.compile(r"\biso\s*27001\b", re.I),
|
| 122 |
],
|
|
|
|
|
|
|
| 123 |
"assigns_all_ip": [
|
| 124 |
-
(re.compile(r"\ball\s+intellectual\s+property\b", re.I),
|
| 125 |
-
(re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I),
|
| 126 |
(re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
|
| 127 |
],
|
| 128 |
"includes_pre_existing_ip": [
|
| 129 |
-
(re.compile(r"\bpre[\-\s]?existing\b", re.I),
|
| 130 |
-
(re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I),
|
| 131 |
-
(re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I),
|
| 132 |
],
|
|
|
|
|
|
|
| 133 |
"is_consumer_contract": [
|
| 134 |
(re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
|
| 135 |
],
|
| 136 |
"has_one_sided_clause": [
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
(re.compile(r"\
|
| 140 |
-
(re.compile(r"\
|
| 141 |
-
(re.compile(r"\
|
|
|
|
|
|
|
| 142 |
],
|
| 143 |
-
|
|
|
|
| 144 |
"has_indemnity_clause": [
|
| 145 |
-
(re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I),
|
| 146 |
-
(re.compile(r"\bhold\s+harmless\b", re.I),
|
| 147 |
-
(re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I),
|
| 148 |
],
|
| 149 |
"indemnity_capped": [
|
| 150 |
(re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
|
| 151 |
(re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
|
| 152 |
],
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
"has_auto_renewal": [
|
| 155 |
-
(re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I),
|
| 156 |
-
(re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I),
|
| 157 |
-
(re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I),
|
| 158 |
-
(re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I),
|
| 159 |
],
|
| 160 |
"has_opt_out_window": [
|
| 161 |
(re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
|
| 162 |
-
(re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I),
|
| 163 |
-
(re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I),
|
|
|
|
|
|
|
| 164 |
],
|
| 165 |
-
|
|
|
|
| 166 |
"has_arbitration": [
|
| 167 |
-
(re.compile(r"\barbitrat(?:ion|or|al)\b", re.I),
|
| 168 |
-
(re.compile(r"\barbitral\s+tribunal\b", re.I),
|
| 169 |
],
|
| 170 |
"arbitration_distant_venue": [
|
| 171 |
-
(re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
|
| 172 |
-
(re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
|
| 173 |
],
|
| 174 |
-
|
|
|
|
| 175 |
"has_exclusivity": [
|
| 176 |
-
|
| 177 |
-
(re.compile(r"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
],
|
| 179 |
"exclusivity_term_defined": [
|
| 180 |
-
(re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+period\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+
|
| 181 |
-
(re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I),
|
|
|
|
| 182 |
],
|
| 183 |
-
|
|
|
|
| 184 |
"unilateral_price_change": [
|
| 185 |
(re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
|
| 186 |
(re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
|
|
@@ -197,7 +217,6 @@ class ClauseFeatureExtractor:
|
|
| 197 |
"has_exclusivity": ["exclusivity_term_defined"],
|
| 198 |
}
|
| 199 |
|
| 200 |
-
# ββ Core extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 201 |
def extract(self, text: str) -> tuple[dict, dict]:
|
| 202 |
features: dict = {}
|
| 203 |
evidence: dict = {}
|
|
@@ -229,6 +248,7 @@ class ClauseFeatureExtractor:
|
|
| 229 |
if child not in features:
|
| 230 |
features[child] = False
|
| 231 |
|
|
|
|
| 232 |
m = re.search(
|
| 233 |
r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
|
| 234 |
r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
|
|
@@ -243,15 +263,13 @@ class ClauseFeatureExtractor:
|
|
| 243 |
"span": [start, end],
|
| 244 |
"label": f"{yrs}-year non-compete",
|
| 245 |
}]
|
| 246 |
-
elif
|
| 247 |
-
r"\bnon[\-\s]?compete\b", r"\bshall\s+not\s+compete\b")):
|
| 248 |
features["non_compete_years"] = 1
|
| 249 |
|
| 250 |
return features, evidence
|
| 251 |
|
| 252 |
-
# ββ Convenience accessors ββββββββββββββββββββββββββββββββββ
|
| 253 |
def extract_unified(self, text: str) -> dict:
|
| 254 |
-
"""Riya's Prompt 3 format: features merged with a flat evidence list."""
|
| 255 |
features, evidence_dict = self.extract(text)
|
| 256 |
out = dict(features)
|
| 257 |
out["evidence"] = self.flatten_evidence(evidence_dict)
|
|
|
|
| 1 |
# feature_extractor.py
|
| 2 |
+
# v5.3 — precision-tightened regex pack.
|
| 3 |
+
# Critical changes:
|
| 4 |
+
# • is_wagering_clause: strict gambling vocab only (no "contingent on …")
|
| 5 |
+
# • has_exclusivity: contextual phrases only (no bare "exclusive" / "exclusive of")
|
| 6 |
+
# • has_uncapped_signal: NEW — gates ICA_007 to require explicit uncapped language
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
import re
|
| 10 |
|
| 11 |
|
| 12 |
class ClauseFeatureExtractor:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
BOOLEAN_PATTERNS: dict[str, list[tuple[re.Pattern, str]]] = {
|
| 14 |
+
# ββ Liability cap (unchanged) ββββββββββββββββββββββββββββββββββββββ
|
| 15 |
"has_liability_cap": [
|
| 16 |
+
(re.compile(r"\bshall\s+not\s+exceed\b", re.I), "shall not exceed"),
|
| 17 |
(re.compile(r"\b(?:maximum|max\.?|total|aggregate|cumulative)\s+liabilit(?:y|ies)\b", re.I), "max liability"),
|
| 18 |
+
(re.compile(r"\bcap(?:ped)?\s+(?:on\s+|of\s+)?liabilit(?:y|ies)\b", re.I), "cap on liability"),
|
| 19 |
(re.compile(r"\blimited\s+to\s+(?:an?\s+amount|the\s+(?:greater|lesser)|rs\.?|inr|usd|\$|βΉ)", re.I), "limited to (amount)"),
|
| 20 |
+
(re.compile(r"\bliability\s+(?:is\s+|shall\s+be\s+)?limited\b", re.I), "liability limited"),
|
| 21 |
],
|
| 22 |
"excludes_gross_negligence": [
|
| 23 |
+
(re.compile(r"\bgross\s+negligen(?:ce|t)\b", re.I), "gross negligence"),
|
| 24 |
+
(re.compile(r"\bwil?l?ful\s+(?:misconduct|default|breach)\b", re.I), "wilful misconduct"),
|
| 25 |
+
(re.compile(r"\bintentional\s+(?:misconduct|breach|wrongdoing|act)\b", re.I), "intentional misconduct"),
|
| 26 |
+
(re.compile(r"\brecklessness?\b", re.I), "recklessness"),
|
| 27 |
+
(re.compile(r"\bbad\s+faith\b", re.I), "bad faith"),
|
| 28 |
+
(re.compile(r"\bfraud(?:ulent)?\b", re.I), "fraud"),
|
| 29 |
],
|
| 30 |
+
|
| 31 |
+
# ββ Liquidated damages (unchanged) βββββββββββββββββββββββββββββββββ
|
| 32 |
"has_liquidated_damages": [
|
| 33 |
+
(re.compile(r"\bliquidated\s+damages?\b", re.I), "liquidated damages"),
|
| 34 |
(re.compile(r"\bpre[\-\s]?(?:determined|estimated|agreed)\s+damages?\b", re.I), "pre-determined damages"),
|
| 35 |
+
(re.compile(r"\bpenalty\s+(?:amount|sum|of)\b", re.I), "penalty amount"),
|
| 36 |
],
|
| 37 |
"damages_exceed_loss": [
|
| 38 |
(re.compile(r"\bregardless\s+of\s+(?:actual\s+)?(?:loss|damage|harm)\b", re.I), "regardless of loss"),
|
| 39 |
+
(re.compile(r"\birrespective\s+of\s+(?:actual\s+)?(?:loss|damage)\b", re.I), "irrespective of loss"),
|
| 40 |
+
(re.compile(r"\bwithout\s+proof\s+of\s+(?:damage|loss)\b", re.I), "without proof of loss"),
|
| 41 |
+
(re.compile(r"\bpenalty\s+clause\b", re.I), "penalty clause"),
|
| 42 |
],
|
| 43 |
+
|
| 44 |
+
# ── WAGERING — TIGHTENED (no "contingent on"; explicit gambling vocab only)
|
| 45 |
"is_wagering_clause": [
|
| 46 |
+
(re.compile(r"\b(?:wager|wagers|wagering|wagered)\b", re.I), "wager"),
|
| 47 |
+
(re.compile(r"\bgambling\b", re.I), "gambling"),
|
| 48 |
+
(re.compile(r"\blotter(?:y|ies)\b", re.I), "lottery"),
|
| 49 |
+
(re.compile(r"\bbetting\b", re.I), "betting"),
|
| 50 |
+
# "bet/bets on X" or "bet against X" — require directional preposition
|
| 51 |
+
(re.compile(r"\bbets?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "bet on/against"),
|
| 52 |
+
# "stake on X" or "stake against X" — same treatment
|
| 53 |
+
(re.compile(r"\bstakes?\s+(?:on|against)\s+(?:the|any|an|a)\s+\w+", re.I), "stake on/against"),
|
| 54 |
],
|
| 55 |
+
|
| 56 |
"restrains_legal_proceedings": [
|
| 57 |
(re.compile(r"\bwaive(?:s|d)?\s+(?:the\s+)?right\s+to\s+(?:sue|bring|file|institute)\b", re.I), "waive right to sue"),
|
| 58 |
(re.compile(r"\b(?:shall|will)\s+not\s+(?:bring|file|commence|institute)\s+(?:any\s+)?(?:action|suit|proceeding)\b", re.I), "no legal action"),
|
| 59 |
+
(re.compile(r"\brelinquish(?:es|ed)?\s+(?:the\s+)?right\s+to\s+(?:sue|claim|recover)\b", re.I), "relinquish right (to sue)"),
|
| 60 |
+
(re.compile(r"\b(?:no|barred\s+from)\s+legal\s+proceedings?\b", re.I), "no legal proceedings"),
|
| 61 |
],
|
| 62 |
+
|
| 63 |
+
# ββ Termination (unchanged) ββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
"unilateral_termination": [
|
| 65 |
+
(re.compile(r"\bmay\s+terminate\b", re.I), "may terminate"),
|
|
|
|
| 66 |
(re.compile(r"\bterminate.*?\b(?:without\s+cause|for\s+convenience|at\s+will|unilaterally)\b", re.I), "without cause"),
|
| 67 |
+
(re.compile(r"\bat\s+(?:its|the)\s+sole\s+discretion\b[^.]{0,40}?\bterminat", re.I), "terminate at sole discretion"),
|
| 68 |
],
|
| 69 |
"notice_period_defined": [
|
| 70 |
(re.compile(r"\b\d+\s*(?:days?|weeks?|months?)\s*(?:prior\s+)?(?:written\s+)?notice\b", re.I), "N days notice"),
|
| 71 |
+
(re.compile(r"\bnotice\s+period\s+of\s+\d+\b", re.I), "notice period of N"),
|
| 72 |
],
|
| 73 |
+
|
| 74 |
+
# ββ DPDPA / IT Act / personal-data signals (unchanged) βββββββββββββ
|
| 75 |
"processes_personal_data": [
|
| 76 |
+
(re.compile(r"\bpersonal\s+(?:data|information)\b", re.I), "personal data"),
|
| 77 |
(re.compile(r"\b(?:collect|process|handle|store)\s+(?:and\s+\w+\s+)?(?:user|customer|individual)\s+(?:data|information)\b", re.I), "process user data"),
|
| 78 |
+
(re.compile(r"\bdata\s+(?:subject|principal)\b", re.I), "data subject/principal"),
|
| 79 |
+
(re.compile(r"\bpii\b", re.I), "PII"),
|
| 80 |
],
|
| 81 |
"processes_sensitive_data": [
|
| 82 |
+
(re.compile(r"\bsensitive\s+personal\s+(?:data|information)\b", re.I), "sensitive personal data"),
|
| 83 |
(re.compile(r"\b(?:health|medical|financial|biometric|aadhaar|aadhar)\s+(?:data|information|details)\b", re.I), "sensitive category"),
|
| 84 |
],
|
| 85 |
"has_data_retention_clause": [
|
| 86 |
+
(re.compile(r"\bretention\s+period\b", re.I), "retention period"),
|
| 87 |
(re.compile(r"\bretain(?:ed|s)?\s+(?:for\s+)?(?:a\s+period\s+of\s+)?\d+\b", re.I), "retain for N"),
|
| 88 |
(re.compile(r"\bdata\s+shall\s+be\s+(?:deleted|purged|anonymised|destroyed)\b", re.I), "data deletion"),
|
| 89 |
+
(re.compile(r"\b(?:purge|anonymise|delete)\s+(?:after|upon)\b", re.I), "purge/delete after"),
|
| 90 |
],
|
| 91 |
"has_breach_notification": [
|
| 92 |
+
(re.compile(r"\bbreach\s+notification\b", re.I), "breach notification"),
|
| 93 |
+
(re.compile(r"\bnotify\s+(?:of\s+)?(?:any\s+)?(?:data\s+)?breach\b", re.I), "notify of breach"),
|
| 94 |
(re.compile(r"\b(?:report|inform|notify)\s+(?:the\s+\w+\s+)?within\s+\d+\s*(?:hours?|days?)\b", re.I), "notify within N"),
|
| 95 |
+
(re.compile(r"\bsecurity\s+incident\b", re.I), "security incident"),
|
| 96 |
],
|
| 97 |
"has_consent_clause": [
|
| 98 |
(re.compile(r"\b(?:with|upon|after)\s+(?:the\s+)?(?:prior\s+|explicit\s+|written\s+)?consent\s+of\b", re.I), "with consent of"),
|
| 99 |
+
(re.compile(r"\bopt[\-\s]?in\b", re.I), "opt-in"),
|
| 100 |
+
(re.compile(r"\bexplicit\s+consent\b", re.I), "explicit consent"),
|
| 101 |
],
|
| 102 |
"handles_digital_data": [
|
| 103 |
+
(re.compile(r"\b(?:digital|electronic|online|cloud[\-\s]?based)\b", re.I), "digital/online/cloud"),
|
| 104 |
+
(re.compile(r"\b(?:server|database|api|software\s+platform|saas)\b", re.I), "server/db/SaaS"),
|
| 105 |
],
|
| 106 |
"has_security_clause": [
|
| 107 |
+
(re.compile(r"\b(?:reasonable\s+)?security\s+measures?\b", re.I), "security measures"),
|
| 108 |
+
(re.compile(r"\bencryption\b", re.I), "encryption"),
|
| 109 |
+
(re.compile(r"\bcyber[\-\s]?security\b", re.I), "cybersecurity"),
|
| 110 |
+
(re.compile(r"\baccess\s+controls?\b", re.I), "access controls"),
|
| 111 |
+
(re.compile(r"\biso\s*27001\b", re.I), "ISO 27001"),
|
| 112 |
],
|
| 113 |
+
|
| 114 |
+
# ββ IP (unchanged) βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 115 |
"assigns_all_ip": [
|
| 116 |
+
(re.compile(r"\ball\s+intellectual\s+property\b", re.I), "all intellectual property"),
|
| 117 |
+
(re.compile(r"\bassigns?\s+(?:all\s+)?(?:rights?|ip|intellectual)\b", re.I), "assigns all IP"),
|
| 118 |
(re.compile(r"\b(?:belongs|vests|shall\s+vest)\s+(?:in|to)\s+(?:the\s+)?(?:client|company|customer|employer)\b", re.I), "vests in client"),
|
| 119 |
],
|
| 120 |
"includes_pre_existing_ip": [
|
| 121 |
+
(re.compile(r"\bpre[\-\s]?existing\s+(?:ip|intellectual|materials?|works?|inventions?)\b", re.I), "pre-existing"),
|
| 122 |
+
(re.compile(r"\bbackground\s+(?:ip|intellectual\s+property)\b", re.I), "background IP"),
|
| 123 |
+
(re.compile(r"\bprior\s+to\s+(?:the\s+)?engagement\b", re.I), "prior to engagement"),
|
| 124 |
],
|
| 125 |
+
|
| 126 |
+
# ββ Consumer (one-sided narrowed slightly) βββββββββββββββββββββββββ
|
| 127 |
"is_consumer_contract": [
|
| 128 |
(re.compile(r"\b(?:consumer|end[\-\s]?user|retail\s+customer|individual\s+customer)\b", re.I), "consumer/end-user"),
|
| 129 |
],
|
| 130 |
"has_one_sided_clause": [
|
| 131 |
+
# "sole discretion" must be paired with a unilateral action verb to
|
| 132 |
+
# avoid firing on operational discretion language.
|
| 133 |
+
(re.compile(r"\bsole\s+discretion\b[^.]{0,50}?\b(?:terminate|modify|change|amend|deny|reject|refuse)\b", re.I), "sole discretion to terminate/modify"),
|
| 134 |
+
(re.compile(r"\bwithout\s+(?:any\s+)?liability\b", re.I), "without liability"),
|
| 135 |
+
(re.compile(r"\bno\s+obligation\s+(?:to|whatsoever)\b", re.I), "no obligation"),
|
| 136 |
+
(re.compile(r"\babsolute\s+(?:right|discretion)\b", re.I), "absolute right"),
|
| 137 |
+
(re.compile(r"\bunconditionally\b", re.I), "unconditionally"),
|
| 138 |
],
|
| 139 |
+
|
| 140 |
+
# ββ Indemnity ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 141 |
"has_indemnity_clause": [
|
| 142 |
+
(re.compile(r"\bindemnif(?:y|ies|ied|ication)\b", re.I), "indemnify/indemnification"),
|
| 143 |
+
(re.compile(r"\bhold\s+harmless\b", re.I), "hold harmless"),
|
| 144 |
+
(re.compile(r"\bdefend\s+(?:and\s+indemnify|the\s+\w+\s+against)\b", re.I), "defend & indemnify"),
|
| 145 |
],
|
| 146 |
"indemnity_capped": [
|
| 147 |
(re.compile(r"\bindemn\w*[^.]{0,80}?\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)\b", re.I), "indemnity cap"),
|
| 148 |
(re.compile(r"\b(?:cap(?:ped)?|limited\s+to|shall\s+not\s+exceed|maximum)[^.]{0,80}?\bindemn\w*\b", re.I), "indemnity cap"),
|
| 149 |
],
|
| 150 |
+
|
| 151 |
+
# ββ NEW: uncapped / unlimited liability signals (gates ICA_007) βββ
|
| 152 |
+
"has_uncapped_signal": [
|
| 153 |
+
(re.compile(r"\bunlimited\s+(?:liabilit(?:y|ies)|exposure|damages?)\b", re.I), "unlimited liability"),
|
| 154 |
+
(re.compile(r"\bwithout\s+(?:any\s+)?(?:limit|limitation|cap|ceiling)\b", re.I), "without limit"),
|
| 155 |
+
(re.compile(r"\bno\s+(?:cap|limit|ceiling|maximum)\b", re.I), "no cap"),
|
| 156 |
+
(re.compile(r"\bany\s+and\s+all\s+(?:claims?|damages?|losses?|expenses?|liabilit(?:y|ies))\b", re.I), "any and all claims/damages"),
|
| 157 |
+
(re.compile(r"\b(?:all|every)\s+(?:and\s+any\s+)?(?:claims?|damages?|losses?|liabilit(?:y|ies))\s+(?:whatsoever|of\s+any\s+kind)\b", re.I), "all damages whatsoever"),
|
| 158 |
+
],
|
| 159 |
+
|
| 160 |
+
# ββ Auto-renewal (unchanged) βββββββββββββββββββββββββββββββββββββββ
|
| 161 |
"has_auto_renewal": [
|
| 162 |
+
(re.compile(r"\bauto(?:matically)?[\-\s]?renew(?:s|al|ed|ing)?\b", re.I), "auto-renew"),
|
| 163 |
+
(re.compile(r"\brenew(?:s|al)?\s+automatically\b", re.I), "renews automatically"),
|
| 164 |
+
(re.compile(r"\bevergreen\s+(?:clause|term)\b", re.I), "evergreen clause"),
|
| 165 |
+
(re.compile(r"\bshall\s+continue\s+(?:to\s+)?renew\b", re.I), "continue to renew"),
|
| 166 |
],
|
| 167 |
"has_opt_out_window": [
|
| 168 |
(re.compile(r"\b\d+\s*(?:days?|weeks?|months?)[^.]{0,60}?(?:prior\s+to|before)\s+(?:the\s+)?(?:renewal|expiry|expiration|end)\b", re.I), "notice before renewal"),
|
| 169 |
+
(re.compile(r"\bnon[\-\s]?renewal\s+notice\b", re.I), "non-renewal notice"),
|
| 170 |
+
(re.compile(r"\b(?:opt[\-\s]?out|terminate)\s+(?:the\s+)?renewal\b", re.I), "opt-out renewal"),
|
| 171 |
+
# Plain "30 days notice" near "renew" is a common opt-out
|
| 172 |
+
(re.compile(r"\brenew\w*[^.]{0,80}?\b\d+\s*(?:days?|weeks?|months?)\s+(?:written\s+)?notice\b", re.I), "N-day notice before renewal"),
|
| 173 |
],
|
| 174 |
+
|
| 175 |
+
# ββ Arbitration (unchanged) ββββββββββββββββββββββββββββββββββββββββ
|
| 176 |
"has_arbitration": [
|
| 177 |
+
(re.compile(r"\barbitrat(?:ion|or|al)\b", re.I), "arbitration"),
|
| 178 |
+
(re.compile(r"\barbitral\s+tribunal\b", re.I), "arbitral tribunal"),
|
| 179 |
],
|
| 180 |
"arbitration_distant_venue": [
|
| 181 |
+
(re.compile(r"\b(?:seat|venue|place)\s+of\s+arbitration\s+(?:shall\s+be\s+|is\s+|in\s+)?(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign seat"),
|
| 182 |
+
(re.compile(r"\barbitrat\w*[^.]{0,60}?\b(?:in|at)\s+(?:singapore|london|new\s+york|hong\s+kong|dubai|paris|zurich|geneva)\b", re.I), "foreign arbitration venue"),
|
| 183 |
],
|
| 184 |
+
|
| 185 |
+
# ── EXCLUSIVITY — TIGHTENED (won't match "exclusive of …") ────────
|
| 186 |
"has_exclusivity": [
|
| 187 |
+
# Contextual nouns — "exclusive rights", "exclusive license", "exclusive distributor", etc.
|
| 188 |
+
(re.compile(r"\bexclusive\s+(?:right|rights|licen[sc]e|licen[sc]ee|distributor|supplier|vendor|agent|territory|market|partner|reseller|dealer)\b", re.I), "exclusive [right/license/distributor/...]"),
|
| 189 |
+
# Canonical exclusivity idioms
|
| 190 |
+
(re.compile(r"\bsole\s+and\s+exclusive\b", re.I), "sole and exclusive"),
|
| 191 |
+
(re.compile(r"\bshall\s+exclusively\b", re.I), "shall exclusively"),
|
| 192 |
+
(re.compile(r"\bgrant(?:s|ed|ing)?\s+(?:an?\s+|the\s+)?exclusive\s+(?:right|rights|licen[sc]e)\b", re.I), "grants exclusive right/license"),
|
| 193 |
+
# "exclusively to/with/for [party]" — directional, not "exclusive of"
|
| 194 |
+
(re.compile(r"\bexclusively\s+(?:to|with|for|by)\s+(?:the|its|a)\s+\w+", re.I), "exclusively to/with"),
|
| 195 |
+
(re.compile(r"\bon\s+an\s+exclusive\s+basis\b", re.I), "on an exclusive basis"),
|
| 196 |
],
|
| 197 |
"exclusivity_term_defined": [
|
| 198 |
+
(re.compile(r"\bexclusiv\w*[^.]{0,80}?\b(?:for\s+a\s+(?:period|term)\s+of|until|through|expires?|terminates?)\b[^.]{0,30}?\d+", re.I), "exclusivity term"),
|
| 199 |
+
(re.compile(r"\bexclusiv\w*[^.]{0,80}?\b\d+\s*(?:years?|months?)\b", re.I), "exclusivity duration"),
|
| 200 |
+
(re.compile(r"\bexclusiv\w*[^.]{0,80}?\bduring\s+the\s+(?:term|initial\s+term)\b", re.I), "during the term"),
|
| 201 |
],
|
| 202 |
+
|
| 203 |
+
# ββ Pricing (unchanged) ββββββββββββββββββββββββββββββββββββββββββββ
|
| 204 |
"unilateral_price_change": [
|
| 205 |
(re.compile(r"\bmay\s+(?:change|modify|adjust|revise|increase|update)\s+(?:the\s+)?(?:prices?|fees?|charges?|rates?)\b", re.I), "may change prices"),
|
| 206 |
(re.compile(r"\b(?:prices?|fees?|charges?|rates?)\s+(?:may\s+be|are\s+subject\s+to)\s+(?:changed|modified|adjusted|revised)\s+at\s+(?:our|its|the\s+\w+s?)\s+(?:sole\s+)?discretion\b", re.I), "prices changed at discretion"),
|
|
|
|
| 217 |
"has_exclusivity": ["exclusivity_term_defined"],
|
| 218 |
}
|
| 219 |
|
|
|
|
| 220 |
def extract(self, text: str) -> tuple[dict, dict]:
|
| 221 |
features: dict = {}
|
| 222 |
evidence: dict = {}
|
|
|
|
| 248 |
if child not in features:
|
| 249 |
features[child] = False
|
| 250 |
|
| 251 |
+
# Non-compete years — accept either ordering, but require explicit context
|
| 252 |
m = re.search(
|
| 253 |
r'(\d+)\s*(?:\(\w+\)\s*)?\s*years?\s*(?:of\s+)?(?:the\s+)?non[\-\s]?compet|'
|
| 254 |
r'non[\-\s]?compet[a-z]*[^.]{0,40}?(\d+)\s*years?',
|
|
|
|
| 263 |
"span": [start, end],
|
| 264 |
"label": f"{yrs}-year non-compete",
|
| 265 |
}]
|
| 266 |
+
elif re.search(r"\bnon[\-\s]?compete\b|\bshall\s+not\s+compete\b", text, re.I):
|
|
|
|
| 267 |
features["non_compete_years"] = 1
|
| 268 |
|
| 269 |
return features, evidence
|
| 270 |
|
| 271 |
+
# ββ Convenience accessors (unchanged) ββββββββββββββββββββββββββββββββββ
|
| 272 |
def extract_unified(self, text: str) -> dict:
|
|
|
|
| 273 |
features, evidence_dict = self.extract(text)
|
| 274 |
out = dict(features)
|
| 275 |
out["evidence"] = self.flatten_evidence(evidence_dict)
|
inference.py
CHANGED
|
@@ -1,20 +1,21 @@
|
|
| 1 |
# inference.py
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
#
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
-
# ββ Clause types that should be weighted toward symbolic rules ββββββββββββββ
|
| 9 |
IP_CLAUSE_TYPES = {
|
| 10 |
"IP Ownership Assignment", "Joint IP Ownership",
|
| 11 |
"Irrevocable Or Perpetual License",
|
| 12 |
"Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
|
| 13 |
}
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
#
|
| 17 |
-
# without inspecting lambda bytecode. Keep in sync with SYMBOLIC_RULES in app.py.
|
| 18 |
RULE_FEATURE_DEPS = {
|
| 19 |
"ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
|
| 20 |
"ICA_002": ["unilateral_termination", "notice_period_defined"],
|
|
@@ -22,7 +23,8 @@ RULE_FEATURE_DEPS = {
|
|
| 22 |
"ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
|
| 23 |
"ICA_005": ["is_wagering_clause"],
|
| 24 |
"ICA_006": ["restrains_legal_proceedings"],
|
| 25 |
-
|
|
|
|
| 26 |
"ICA_008": ["has_auto_renewal", "has_opt_out_window"],
|
| 27 |
"ICA_009": ["has_arbitration", "arbitration_distant_venue"],
|
| 28 |
"ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
|
|
@@ -35,16 +37,22 @@ RULE_FEATURE_DEPS = {
|
|
| 35 |
"CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
|
| 36 |
}
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
|
| 40 |
-
"""
|
| 41 |
-
Evaluate symbolic rules against extracted features.
|
| 42 |
-
Returns:
|
| 43 |
-
{
|
| 44 |
-
"symbolic_score": float, # clamped to [0, 1]
|
| 45 |
-
"triggered_rules": list[dict], # rules whose condition fired
|
| 46 |
-
}
|
| 47 |
-
"""
|
| 48 |
triggered, total = [], 0.0
|
| 49 |
for rule in symbolic_rules:
|
| 50 |
try:
|
|
@@ -52,7 +60,6 @@ def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
|
|
| 52 |
triggered.append(rule)
|
| 53 |
total += rule["penalty"]
|
| 54 |
except Exception:
|
| 55 |
-
# A malformed rule must not crash inference.
|
| 56 |
pass
|
| 57 |
return {
|
| 58 |
"symbolic_score": round(min(total, 1.0), 3),
|
|
@@ -66,46 +73,24 @@ def _neuro_symbolic_fusion(
|
|
| 66 |
is_ip_clause: bool = False,
|
| 67 |
) -> dict:
|
| 68 |
"""
|
| 69 |
-
Weighted fusion
|
| 70 |
-
|
| 71 |
-
IP clauses shift weight toward symbolic rules (which capture IP-specific law).
|
| 72 |
-
A non-zero symbolic score forces a Medium-or-higher floor (since rule triggers
|
| 73 |
-
represent deterministic legal violations).
|
| 74 |
-
|
| 75 |
-
Returns:
|
| 76 |
-
{
|
| 77 |
-
"score": float,
|
| 78 |
-
"level": "Low" | "Medium" | "High",
|
| 79 |
-
"emoji": str,
|
| 80 |
-
"breakdown": {
|
| 81 |
-
"neural_score": float,
|
| 82 |
-
"symbolic_score": float,
|
| 83 |
-
"weights": {"neural": float, "symbolic": float},
|
| 84 |
-
"raw_fused": float, # pre-floor
|
| 85 |
-
"floor_applied": bool,
|
| 86 |
-
"final": float,
|
| 87 |
-
"formula": str, # human-readable computation
|
| 88 |
-
},
|
| 89 |
-
}
|
| 90 |
"""
|
|
|
|
|
|
|
| 91 |
if is_ip_clause and symbolic > 0:
|
| 92 |
-
w_n, w_s = 0.35, 0.65
|
| 93 |
-
else:
|
| 94 |
w_n, w_s = 0.60, 0.40
|
|
|
|
|
|
|
| 95 |
|
| 96 |
raw = w_n * neural + w_s * symbolic
|
| 97 |
-
|
| 98 |
-
score = max(raw, 0.30) if floor else raw
|
| 99 |
-
score = round(min(score, 1.0), 3)
|
| 100 |
|
| 101 |
-
|
| 102 |
-
elif score <= 0.66: level, emoji = "Medium", "π‘"
|
| 103 |
-
else: level, emoji = "High", "π΄"
|
| 104 |
|
| 105 |
formula = (
|
| 106 |
f"({w_n:.2f} Γ {neural:.3f}) + ({w_s:.2f} Γ {symbolic:.3f}) "
|
| 107 |
f"= {round(raw, 3)}"
|
| 108 |
-
+ (f" β floor 0.30 applied (symbolic triggers present)" if floor else "")
|
| 109 |
)
|
| 110 |
|
| 111 |
return {
|
|
@@ -117,7 +102,7 @@ def _neuro_symbolic_fusion(
|
|
| 117 |
"symbolic_score": round(symbolic, 3),
|
| 118 |
"weights": {"neural": w_n, "symbolic": w_s},
|
| 119 |
"raw_fused": round(raw, 3),
|
| 120 |
-
"floor_applied":
|
| 121 |
"final": score,
|
| 122 |
"formula": formula,
|
| 123 |
},
|
|
@@ -132,30 +117,19 @@ def _compute_confidence(
|
|
| 132 |
neural_loaded: bool = True,
|
| 133 |
) -> dict:
|
| 134 |
"""
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
Returns:
|
| 141 |
-
{
|
| 142 |
-
"level": "Low" | "Medium" | "High",
|
| 143 |
-
"score": float,
|
| 144 |
-
"factors": { "boundary_dist": float, "agreement": float,
|
| 145 |
-
"rule_strength": float },
|
| 146 |
-
}
|
| 147 |
"""
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
dist_factor = min(boundary_dist / 0.18, 1.0)
|
| 151 |
|
| 152 |
-
# Neural vs symbolic agreement (only meaningful if neural is loaded)
|
| 153 |
if neural_loaded:
|
| 154 |
agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
|
| 155 |
else:
|
| 156 |
-
agree_factor = 0.5
|
| 157 |
|
| 158 |
-
# Rule signal β more triggers = stronger deterministic evidence
|
| 159 |
if num_triggered == 0: rule_factor = 0.40
|
| 160 |
elif num_triggered == 1: rule_factor = 0.70
|
| 161 |
else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
|
|
|
|
| 1 |
# inference.py
|
| 2 |
+
# Hardening v5.3:
|
| 3 |
+
# - Neural-dominant fusion (default 0.75 / 0.25, IP 0.60 / 0.40)
|
| 4 |
+
# - Symbolic floor of 0.30 REMOVED
|
| 5 |
+
# - Risk-level thresholds: Low < 0.50, Medium 0.50–0.80, High > 0.80
|
| 6 |
+
# - Confidence recalibrated for the new thresholds
|
| 7 |
+
# - RULE_FEATURE_DEPS updated for tightened ICA_007
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
|
|
|
| 11 |
IP_CLAUSE_TYPES = {
|
| 12 |
"IP Ownership Assignment", "Joint IP Ownership",
|
| 13 |
"Irrevocable Or Perpetual License",
|
| 14 |
"Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
|
| 15 |
}
|
| 16 |
|
| 17 |
+
# Rule -> feature dependencies. Used by the explanation engine to surface
|
| 18 |
+
# matched evidence per rule (no lambda introspection required).
|
|
|
|
| 19 |
RULE_FEATURE_DEPS = {
|
| 20 |
"ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
|
| 21 |
"ICA_002": ["unilateral_termination", "notice_period_defined"],
|
|
|
|
| 23 |
"ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
|
| 24 |
"ICA_005": ["is_wagering_clause"],
|
| 25 |
"ICA_006": ["restrains_legal_proceedings"],
|
| 26 |
+
# ICA_007 tightened: now requires has_uncapped_signal too
|
| 27 |
+
"ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
|
| 28 |
"ICA_008": ["has_auto_renewal", "has_opt_out_window"],
|
| 29 |
"ICA_009": ["has_arbitration", "arbitration_distant_venue"],
|
| 30 |
"ICA_010": ["has_exclusivity", "exclusivity_term_defined"],
|
|
|
|
| 37 |
"CPA_001": ["is_consumer_contract", "has_one_sided_clause"],
|
| 38 |
}
|
| 39 |
|
| 40 |
+
# ── Risk-level thresholds (single source of truth) ──────────────────────────
RISK_LOW_MAX = 0.50     # score <  0.50         → Low
RISK_MEDIUM_MAX = 0.80  # 0.50 <= score <= 0.80 → Medium; score > 0.80 → High


def level_from_score(score: float) -> tuple[str, str]:
    """Return ``(level_label, emoji)`` for a fused score under the v5.3 thresholds.

    The bands are half-open at the lower cutoff and closed at the upper one:

    * ``score <  RISK_LOW_MAX``     -> ``("Low", "🟢")``
    * ``score <= RISK_MEDIUM_MAX``  -> ``("Medium", "🟡")``
    * anything else                 -> ``("High", "🔴")``

    Note that a NaN score fails both comparisons and therefore falls through
    to ``"High"``.
    """
    if score < RISK_LOW_MAX:
        return "Low", "🟢"
    if score <= RISK_MEDIUM_MAX:
        return "Medium", "🟡"
    return "High", "🔴"
|
| 52 |
+
|
| 53 |
|
| 54 |
def _symbolic_rule_score(features: dict, symbolic_rules: list) -> dict:
|
| 55 |
+
"""Evaluate symbolic rules. Score is clamped to [0, 1]."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
triggered, total = [], 0.0
|
| 57 |
for rule in symbolic_rules:
|
| 58 |
try:
|
|
|
|
| 60 |
triggered.append(rule)
|
| 61 |
total += rule["penalty"]
|
| 62 |
except Exception:
|
|
|
|
| 63 |
pass
|
| 64 |
return {
|
| 65 |
"symbolic_score": round(min(total, 1.0), 3),
|
|
|
|
| 73 |
is_ip_clause: bool = False,
|
| 74 |
) -> dict:
|
| 75 |
"""
|
| 76 |
+
Weighted fusion — neural-dominant by design.
|
| 77 |
+
No artificial floor: a weak symbolic trigger no longer inflates risk.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
+
# Default neural-dominant. IP clauses give a bit more weight to symbolic,
|
| 80 |
+
# but symbolic NEVER outweighs neural.
|
| 81 |
if is_ip_clause and symbolic > 0:
|
|
|
|
|
|
|
| 82 |
w_n, w_s = 0.60, 0.40
|
| 83 |
+
else:
|
| 84 |
+
w_n, w_s = 0.75, 0.25
|
| 85 |
|
| 86 |
raw = w_n * neural + w_s * symbolic
|
| 87 |
+
score = round(min(max(raw, 0.0), 1.0), 3)
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
level, emoji = level_from_score(score)
|
|
|
|
|
|
|
| 90 |
|
| 91 |
formula = (
|
| 92 |
f"({w_n:.2f} Γ {neural:.3f}) + ({w_s:.2f} Γ {symbolic:.3f}) "
|
| 93 |
f"= {round(raw, 3)}"
|
|
|
|
| 94 |
)
|
| 95 |
|
| 96 |
return {
|
|
|
|
| 102 |
"symbolic_score": round(symbolic, 3),
|
| 103 |
"weights": {"neural": w_n, "symbolic": w_s},
|
| 104 |
"raw_fused": round(raw, 3),
|
| 105 |
+
"floor_applied": False, # retained for UI compat; always False now
|
| 106 |
"final": score,
|
| 107 |
"formula": formula,
|
| 108 |
},
|
|
|
|
| 117 |
neural_loaded: bool = True,
|
| 118 |
) -> dict:
|
| 119 |
"""
|
| 120 |
+
Three-factor confidence calibrated for the new thresholds.
|
| 121 |
+
boundary_dist — distance from the nearest risk-level cutoff (0.50, 0.80)
|
| 122 |
+
agreement — 1 - |neural - symbolic| (only when neural is loaded)
|
| 123 |
+
rule_strength — more triggered rules → stronger deterministic evidence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"""
|
| 125 |
+
boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
|
| 126 |
+
dist_factor = min(boundary_dist / 0.20, 1.0)
|
|
|
|
| 127 |
|
|
|
|
| 128 |
if neural_loaded:
|
| 129 |
agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
|
| 130 |
else:
|
| 131 |
+
agree_factor = 0.5
|
| 132 |
|
|
|
|
| 133 |
if num_triggered == 0: rule_factor = 0.40
|
| 134 |
elif num_triggered == 1: rule_factor = 0.70
|
| 135 |
else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
|
local_interpreters.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
# local_interpreters.py
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
#
|
| 5 |
-
# when a clause is opened β never during batch analysis)
|
| 6 |
|
| 7 |
from __future__ import annotations
|
|
|
|
| 8 |
import time
|
| 9 |
import numpy as np
|
| 10 |
import torch
|
|
@@ -16,15 +16,65 @@ except Exception:
|
|
| 16 |
_LIME_AVAILABLE = False
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
class LocalExplainer:
|
| 20 |
-
"""
|
| 21 |
-
LIME + attention extraction.
|
| 22 |
-
|
| 23 |
-
Methods:
|
| 24 |
-
β’ explain_with_lime(text, predict_fn) -> list[{word, weight}]
|
| 25 |
-
β’ get_attention_map(text, model, tokenizer) -> list[{token, weight}]
|
| 26 |
-
"""
|
| 27 |
-
|
| 28 |
def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
|
| 29 |
self.num_samples = num_samples
|
| 30 |
self.timeout_seconds = timeout_seconds
|
|
@@ -44,38 +94,43 @@ class LocalExplainer:
|
|
| 44 |
self,
|
| 45 |
text: str,
|
| 46 |
predict_fn,
|
| 47 |
-
num_features: int =
|
|
|
|
| 48 |
) -> list[dict]:
|
| 49 |
"""
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
column 1 = "risky" prob (risk_score)
|
| 53 |
-
Returns a sorted list of {word, weight} (positive = pushes toward risky).
|
| 54 |
"""
|
| 55 |
if not _LIME_AVAILABLE or self._lime is None:
|
| 56 |
return []
|
| 57 |
try:
|
| 58 |
t0 = time.time()
|
| 59 |
exp = self._lime.explain_instance(
|
| 60 |
-
text_instance=text[:1500],
|
| 61 |
classifier_fn=predict_fn,
|
| 62 |
num_features=num_features,
|
| 63 |
num_samples=self.num_samples,
|
| 64 |
labels=(1,),
|
| 65 |
)
|
| 66 |
elapsed = time.time() - t0
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
for w, s in
|
| 72 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
except Exception as e:
|
| 74 |
self.last_error = str(e)
|
| 75 |
print(f"[WARN] LIME failed: {e}")
|
| 76 |
return []
|
| 77 |
|
| 78 |
-
# ββ Attention ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
def get_attention_map(
|
| 80 |
self,
|
| 81 |
text: str,
|
|
@@ -84,12 +139,6 @@ class LocalExplainer:
|
|
| 84 |
max_length: int = 256,
|
| 85 |
top_k: int = 30,
|
| 86 |
) -> list[dict]:
|
| 87 |
-
"""
|
| 88 |
-
Extract Longformer last-layer global attention from CLS over the
|
| 89 |
-
sequence, average over heads, return top_k tokens by weight.
|
| 90 |
-
Runs a second forward pass with output_attentions=True (memory cost
|
| 91 |
-
is bounded because we only do this for a single clause on demand).
|
| 92 |
-
"""
|
| 93 |
if model is None or tokenizer is None:
|
| 94 |
return []
|
| 95 |
try:
|
|
@@ -100,8 +149,6 @@ class LocalExplainer:
|
|
| 100 |
input_ids = enc["input_ids"]
|
| 101 |
attention_mask = enc["attention_mask"]
|
| 102 |
|
| 103 |
-
# Force the CLS token (position 0) to use global attention so we
|
| 104 |
-
# get a proper distribution over the whole sequence.
|
| 105 |
global_mask = torch.zeros_like(input_ids)
|
| 106 |
global_mask[:, 0] = 1
|
| 107 |
|
|
@@ -113,20 +160,16 @@ class LocalExplainer:
|
|
| 113 |
output_attentions = True,
|
| 114 |
)
|
| 115 |
|
| 116 |
-
# Longformer exposes global_attentions when global tokens exist.
|
| 117 |
-
# Shape: tuple of (batch, num_heads, num_global_tokens, seq_len)
|
| 118 |
if not getattr(out, "global_attentions", None):
|
| 119 |
return []
|
| 120 |
|
| 121 |
-
last_global = out.global_attentions[-1]
|
| 122 |
-
cls_attn = last_global[0, :, 0, :].mean(dim=0)
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
|
| 127 |
weights = cls_attn[:n_real].cpu().numpy()
|
| 128 |
|
| 129 |
-
# Skip special tokens for the ranking, but keep them in the sequence
|
| 130 |
specials = {tokenizer.cls_token, tokenizer.sep_token,
|
| 131 |
tokenizer.pad_token, tokenizer.bos_token,
|
| 132 |
tokenizer.eos_token, tokenizer.unk_token}
|
|
@@ -145,15 +188,11 @@ class LocalExplainer:
|
|
| 145 |
return []
|
| 146 |
|
| 147 |
|
| 148 |
-
# ββ
|
|
|
|
|
|
|
| 149 |
def build_predict_fn_for_manager(manager, max_length: int = 256):
|
| 150 |
-
"""
|
| 151 |
-
Returns a callable LIME can use: list[str] -> np.ndarray (n, 2).
|
| 152 |
-
Uses the manager's model + tokenizer. Falls back to symbolic if the
|
| 153 |
-
neural model isn't loaded (each text just gets [1-sym, sym]).
|
| 154 |
-
"""
|
| 155 |
def predict_fn(texts: list[str]) -> np.ndarray:
|
| 156 |
-
# Symbolic-only fallback path
|
| 157 |
if not (manager.is_ready and manager.model is not None):
|
| 158 |
from inference import _symbolic_rule_score
|
| 159 |
from app import SYMBOLIC_RULES
|
|
@@ -164,7 +203,6 @@ def build_predict_fn_for_manager(manager, max_length: int = 256):
|
|
| 164 |
probs.append([1.0 - sym, sym])
|
| 165 |
return np.array(probs, dtype=np.float32)
|
| 166 |
|
| 167 |
-
# Neural path β batch through Longformer
|
| 168 |
enc = manager.tokenizer(
|
| 169 |
list(texts),
|
| 170 |
padding="max_length", truncation=True,
|
|
|
|
| 1 |
# local_interpreters.py
|
| 2 |
+
# v5.3 — LIME output filtered for legal interpretability.
|
| 3 |
+
# The neural model still sees ORIGINAL text. Filtering happens at the
|
| 4 |
+
# display layer only.
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
+
import re
|
| 8 |
import time
|
| 9 |
import numpy as np
|
| 10 |
import torch
|
|
|
|
| 16 |
_LIME_AVAILABLE = False
|
| 17 |
|
| 18 |
|
| 19 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
+
# Token filtering — display-time only
|
| 21 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
LIME_STOPWORDS = {
|
| 23 |
+
"a", "an", "the",
|
| 24 |
+
"of", "in", "on", "at", "to", "for", "by", "with", "from",
|
| 25 |
+
"and", "or", "but",
|
| 26 |
+
"this", "that", "these", "those",
|
| 27 |
+
"it", "its",
|
| 28 |
+
"be", "is", "are", "was", "were", "been", "being",
|
| 29 |
+
"have", "has", "had", "do", "does", "did",
|
| 30 |
+
"as", "if", "so", "than", "then",
|
| 31 |
+
"any", "all", "such", "no", # ambiguous but mostly noise here
|
| 32 |
+
"i", "we", "you", "they", "he", "she",
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
# Legal modal / operative words β never filter these even if they look small
|
| 36 |
+
LIME_KEEP_LEGAL = {
|
| 37 |
+
"shall", "may", "must", "not", "only", "unless", "except", "without",
|
| 38 |
+
"subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
|
| 39 |
+
"terminate", "termination", "exclusive", "exclusively", "exclusivity",
|
| 40 |
+
"warrant", "warranty", "breach", "obligation", "covenant", "license",
|
| 41 |
+
"licence", "damages", "consent", "notice", "renew", "renewal",
|
| 42 |
+
"arbitration", "arbitrator", "jurisdiction", "wager", "gambling",
|
| 43 |
+
"assign", "assignment", "limit", "cap", "uncapped", "unlimited",
|
| 44 |
+
"confidential", "disclose", "non-compete", "non-solicit",
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
# Roman numeral regex (used for filtering things like "IV", "xii")
|
| 48 |
+
_ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _clean_token(raw: str) -> str:
|
| 52 |
+
"""Strip leading/trailing punctuation; return lowercased core."""
|
| 53 |
+
return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _is_useful_lime_token(raw: str) -> bool:
|
| 57 |
+
core = _clean_token(raw)
|
| 58 |
+
if not core:
|
| 59 |
+
return False
|
| 60 |
+
if core in LIME_KEEP_LEGAL:
|
| 61 |
+
return True
|
| 62 |
+
if core in LIME_STOPWORDS:
|
| 63 |
+
return False
|
| 64 |
+
if core.isdigit():
|
| 65 |
+
return False
|
| 66 |
+
if _ROMAN_NUMERAL.fullmatch(core):
|
| 67 |
+
return False
|
| 68 |
+
# Need at least 2 alphanumeric chars to be a meaningful word
|
| 69 |
+
if sum(c.isalnum() for c in core) < 2:
|
| 70 |
+
return False
|
| 71 |
+
return True
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 75 |
+
# LocalExplainer
|
| 76 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
class LocalExplainer:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
|
| 79 |
self.num_samples = num_samples
|
| 80 |
self.timeout_seconds = timeout_seconds
|
|
|
|
| 94 |
self,
|
| 95 |
text: str,
|
| 96 |
predict_fn,
|
| 97 |
+
num_features: int = 25, # raw β over-request, filter later
|
| 98 |
+
display_count: int = 12,
|
| 99 |
) -> list[dict]:
|
| 100 |
"""
|
| 101 |
+
Returns up to `display_count` filtered token contributions, sorted by
|
| 102 |
+
absolute weight. The model itself still sees the original full text.
|
|
|
|
|
|
|
| 103 |
"""
|
| 104 |
if not _LIME_AVAILABLE or self._lime is None:
|
| 105 |
return []
|
| 106 |
try:
|
| 107 |
t0 = time.time()
|
| 108 |
exp = self._lime.explain_instance(
|
| 109 |
+
text_instance=text[:1500],
|
| 110 |
classifier_fn=predict_fn,
|
| 111 |
num_features=num_features,
|
| 112 |
num_samples=self.num_samples,
|
| 113 |
labels=(1,),
|
| 114 |
)
|
| 115 |
elapsed = time.time() - t0
|
| 116 |
+
raw_pairs = exp.as_list(label=1)
|
| 117 |
+
|
| 118 |
+
# Filter for legal interpretability
|
| 119 |
+
filtered: list[tuple[str, float]] = [
|
| 120 |
+
(w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
|
| 121 |
]
|
| 122 |
+
filtered.sort(key=lambda x: abs(x[1]), reverse=True)
|
| 123 |
+
top = filtered[:display_count]
|
| 124 |
+
|
| 125 |
+
print(f"[INFO] LIME {elapsed:.1f}s, raw={len(raw_pairs)}, "
|
| 126 |
+
f"filtered={len(filtered)}, displayed={len(top)}")
|
| 127 |
+
return [{"word": w, "weight": round(s, 4)} for w, s in top]
|
| 128 |
except Exception as e:
|
| 129 |
self.last_error = str(e)
|
| 130 |
print(f"[WARN] LIME failed: {e}")
|
| 131 |
return []
|
| 132 |
|
| 133 |
+
# ββ Attention (unchanged) ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
def get_attention_map(
|
| 135 |
self,
|
| 136 |
text: str,
|
|
|
|
| 139 |
max_length: int = 256,
|
| 140 |
top_k: int = 30,
|
| 141 |
) -> list[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
if model is None or tokenizer is None:
|
| 143 |
return []
|
| 144 |
try:
|
|
|
|
| 149 |
input_ids = enc["input_ids"]
|
| 150 |
attention_mask = enc["attention_mask"]
|
| 151 |
|
|
|
|
|
|
|
| 152 |
global_mask = torch.zeros_like(input_ids)
|
| 153 |
global_mask[:, 0] = 1
|
| 154 |
|
|
|
|
| 160 |
output_attentions = True,
|
| 161 |
)
|
| 162 |
|
|
|
|
|
|
|
| 163 |
if not getattr(out, "global_attentions", None):
|
| 164 |
return []
|
| 165 |
|
| 166 |
+
last_global = out.global_attentions[-1]
|
| 167 |
+
cls_attn = last_global[0, :, 0, :].mean(dim=0)
|
| 168 |
|
| 169 |
+
n_real = int(attention_mask[0].sum().item())
|
| 170 |
+
tokens = tokenizer.convert_ids_to_tokens(input_ids[0][:n_real])
|
|
|
|
| 171 |
weights = cls_attn[:n_real].cpu().numpy()
|
| 172 |
|
|
|
|
| 173 |
specials = {tokenizer.cls_token, tokenizer.sep_token,
|
| 174 |
tokenizer.pad_token, tokenizer.bos_token,
|
| 175 |
tokenizer.eos_token, tokenizer.unk_token}
|
|
|
|
| 188 |
return []
|
| 189 |
|
| 190 |
|
| 191 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 192 |
+
# Predict-fn factory (unchanged)
|
| 193 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 194 |
def build_predict_fn_for_manager(manager, max_length: int = 256):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
def predict_fn(texts: list[str]) -> np.ndarray:
|
|
|
|
| 196 |
if not (manager.is_ready and manager.model is not None):
|
| 197 |
from inference import _symbolic_rule_score
|
| 198 |
from app import SYMBOLIC_RULES
|
|
|
|
| 203 |
probs.append([1.0 - sym, sym])
|
| 204 |
return np.array(probs, dtype=np.float32)
|
| 205 |
|
|
|
|
| 206 |
enc = manager.tokenizer(
|
| 207 |
list(texts),
|
| 208 |
padding="max_length", truncation=True,
|
pdf_utils.py
CHANGED
|
@@ -1,157 +1,188 @@
|
|
| 1 |
# pdf_utils.py
|
| 2 |
-
#
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
from __future__ import annotations
|
| 5 |
import re
|
| 6 |
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
import fitz # pymupdf
|
| 13 |
-
|
| 14 |
-
doc = fitz.open(file_path)
|
| 15 |
-
pages = []
|
| 16 |
|
| 17 |
-
for page in doc:
|
| 18 |
-
text = page.get_text("text")
|
| 19 |
-
pages.append(text)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
doc.close()
|
| 22 |
-
|
| 23 |
raw = "\n".join(pages)
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
raw = re.sub(r
|
| 27 |
-
raw = re.sub(r
|
| 28 |
-
raw = re.sub(r"\n{3,}", "\n\n", raw)
|
| 29 |
-
|
| 30 |
return raw.strip()
|
| 31 |
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
"""
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
Returns
|
| 38 |
-
list[str]
|
| 39 |
"""
|
| 40 |
-
|
| 41 |
-
|
|
|
|
| 42 |
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
{
|
| 58 |
-
"text":
|
| 59 |
-
"number":
|
| 60 |
-
"kind": "
|
| 61 |
}
|
|
|
|
| 62 |
]
|
| 63 |
-
"""
|
| 64 |
-
|
| 65 |
-
if not text or not text.strip():
|
| 66 |
-
return []
|
| 67 |
-
|
| 68 |
-
text = text.strip()
|
| 69 |
|
| 70 |
-
numbered_pattern = re.compile(
|
| 71 |
-
r'(?m)^(?=\s*(?:\d+(?:\.\d+)*\.?|Article\s+\d+|Section\s+\d+))',
|
| 72 |
-
re.IGNORECASE
|
| 73 |
-
)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
)
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
return match.group(1) if match else None
|
| 95 |
-
|
| 96 |
-
return None
|
| 97 |
-
|
| 98 |
-
def build_metadata(parts: list[str], kind: str) -> list[dict]:
|
| 99 |
-
results = []
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
continue
|
| 106 |
|
| 107 |
-
results.append({
|
| 108 |
-
"text": clause,
|
| 109 |
-
"number": extract_clause_number(clause, kind),
|
| 110 |
-
"kind": kind
|
| 111 |
-
})
|
| 112 |
-
|
| 113 |
-
return results
|
| 114 |
-
|
| 115 |
-
# Strategy 1: numbered clauses
|
| 116 |
-
numbered_parts = [p.strip() for p in numbered_pattern.split(text) if p.strip()]
|
| 117 |
-
if len(numbered_parts) > 1:
|
| 118 |
-
result = build_metadata(numbered_parts, "numbered")
|
| 119 |
-
if len(result) > 1:
|
| 120 |
-
return result
|
| 121 |
-
|
| 122 |
-
# Strategy 2: lettered clauses
|
| 123 |
-
lettered_parts = [p.strip() for p in lettered_pattern.split(text) if p.strip()]
|
| 124 |
-
if len(lettered_parts) > 1:
|
| 125 |
-
result = build_metadata(lettered_parts, "lettered")
|
| 126 |
-
if len(result) > 1:
|
| 127 |
-
return result
|
| 128 |
-
|
| 129 |
-
# Strategy 3: ALL CAPS headings
|
| 130 |
-
caps_parts = [p.strip() for p in caps_pattern.split(text) if p.strip()]
|
| 131 |
-
if len(caps_parts) > 1:
|
| 132 |
-
result = build_metadata(caps_parts, "caps_heading")
|
| 133 |
-
if len(result) > 1:
|
| 134 |
-
return result
|
| 135 |
-
|
| 136 |
-
# Strategy 4: paragraph fallback
|
| 137 |
-
paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
|
| 138 |
-
|
| 139 |
-
fallback = []
|
| 140 |
-
|
| 141 |
-
for para in paragraphs:
|
| 142 |
-
if len(para) >= min_length:
|
| 143 |
-
fallback.append({
|
| 144 |
-
"text": para,
|
| 145 |
-
"number": None,
|
| 146 |
-
"kind": "paragraph"
|
| 147 |
-
})
|
| 148 |
-
|
| 149 |
-
# Final fallback: whole document as one clause
|
| 150 |
-
if not fallback and len(text) >= min_length:
|
| 151 |
-
fallback.append({
|
| 152 |
-
"text": text,
|
| 153 |
-
"number": None,
|
| 154 |
-
"kind": "full_text"
|
| 155 |
-
})
|
| 156 |
|
| 157 |
-
|
|
|
|
|
|
|
|
|
| 1 |
# pdf_utils.py
|
| 2 |
+
# v5.3 — finer-grained clause segmentation.
|
| 3 |
+
# Adds:
|
| 4 |
+
#   • Inline subclause splitting for long clauses (a), (b), (c), (i), (ii) …
|
| 5 |
+
#   • Hard length cap with sentence-boundary fallback
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
import re
|
| 9 |
|
| 10 |
|
| 11 |
+
# Clauses longer than this many characters are offered to the inline
# subclause splitter in split_into_clauses_with_metadata.
LONG_CLAUSE_CHARS = 1200
# Default `max_len` for _hard_cap_split: clauses above this length are
# chunked on sentence boundaries.
MAX_CLAUSE_CHARS = 3000
# Default minimum length of a subclause in _split_inline_subclauses, and the
# minimum length of a chunk kept by _hard_cap_split.
MIN_SUBCLAUSE_LEN = 60
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
+
# PDF extraction (unchanged)
|
| 18 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
def extract_text_from_pdf(file_path: str) -> str:
    """Extract and lightly normalise the plain text of a PDF.

    Args:
        file_path: Path of the PDF to open with ``fitz`` (PyMuPDF).

    Returns:
        The text of all pages joined with newlines, with CRLF converted to
        LF, runs of spaces/tabs collapsed to one space, three or more
        consecutive newlines collapsed to two, and words hyphenated across a
        line break re-joined. Leading/trailing whitespace is stripped.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        file; the document handle is closed before the error propagates.
    """
    import fitz  # imported lazily so the module loads without PyMuPDF

    doc = fitz.open(file_path)
    try:
        pages = [page.get_text("text") for page in doc]
    finally:
        # Always release the file handle, even when a page fails to extract.
        doc.close()

    raw = "\n".join(pages)
    raw = re.sub(r'\r\n', '\n', raw)
    raw = re.sub(r'[ \t]+', ' ', raw)
    raw = re.sub(r'\n{3,}', '\n\n', raw)
    # Re-join words split by end-of-line hyphenation ("liabil-\nity").
    raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)
    return raw.strip()
|
| 30 |
|
| 31 |
|
| 32 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
# Header detection (primary segmentation)
|
| 34 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
# (kind, pattern) pairs tried against the whole document by _collect_headers.
# Every pattern is multiline-anchored at a line start and captures the header
# marker itself in group 1.
_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
    # "1", "1.", "2.3", "4.1.2.3." followed by whitespace and some text
    ("decimal", re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
    # "Article 5", "Section 2.1", "Schedule IV", ... (case-insensitive)
    ("article", re.compile(
        r'(?m)^\s*((?:Article|Section|Clause|Schedule|Annexure|Annex|Appendix|Part|Chapter)'
        r'\s+(?:\d+(?:\.\d+){0,2}|[IVXLC]+))[\s\.\-:]', re.IGNORECASE)),
    # "(a)", "(B)", "(iv)" — one to four letters in parentheses
    ("lettered", re.compile(r'(?m)^\s*(\(\s*[a-zA-Z]{1,4}\s*\))\s+(?=\S)')),
    # "I.", "IV.", "XIII." — upper-case I/V/X followed by a dot
    ("roman", re.compile(r'(?m)^\s*([IVX]{1,5}\.)\s+(?=\S)')),
    # A whole line of 5-60 upper-case letters, digits, spaces, "&", "/" or "-"
    ("caps", re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
]

# Inline subclause markers — used in the SECOND pass (mid-text, not line-start).
# A marker is "(x)" with a single letter or up to four of i/v/x, preceded by
# whitespace, ".", ";" or ":" and followed by whitespace plus a word character.
_INLINE_SUBCLAUSE = re.compile(
    r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
    re.IGNORECASE,
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _collect_headers(text: str) -> list[tuple[int, str, str]]:
    """Locate every header marker in *text*.

    Each pattern in ``_HEADER_PATTERNS`` is run over the text and every hit
    is recorded as ``(offset_of_marker, marker_text, kind)``. Hits are
    ordered by offset; a hit starting within two characters of the previously
    kept hit is treated as a duplicate and skipped, so the pattern listed
    earlier wins when two patterns fire on the same spot.
    """
    candidates = sorted(
        (
            (found.start(1), found.group(1).strip(), label)
            for label, pattern in _HEADER_PATTERNS
            for found in pattern.finditer(text)
        ),
        key=lambda item: item[0],
    )

    unique: list[tuple[int, str, str]] = []
    for candidate in candidates:
        if unique and abs(candidate[0] - unique[-1][0]) <= 2:
            continue
        unique.append(candidate)
    return unique
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
# Inline subclause post-processing
|
| 68 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
+
def _split_inline_subclauses(
    body: str,
    parent_number: str | None = None,
    min_length: int = MIN_SUBCLAUSE_LEN,
) -> list[dict]:
    """Split a clause body on inline subclause markers such as "(a)" or "(ii)".

    Args:
        body: Full text of the parent clause.
        parent_number: Marker of the parent clause, prefixed to each
            subclause marker (e.g. "5.7" + "(a)" -> "5.7(a)"). ``None`` leaves
            the subclause marker on its own.
        min_length: Chunks shorter than this are not emitted as their own
            subclause; they are attached to a neighbouring part instead.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts covering the whole
        body, or ``[]`` when fewer than two markers are found or the split
        would yield fewer than two parts (the caller then keeps the original
        clause). No text is discarded: fragments too short to stand alone are
        folded into the previous part, or carried into the next one when no
        part exists yet.
    """
    matches = list(_INLINE_SUBCLAUSE.finditer(body))
    if len(matches) < 2:
        return []

    parts: list[dict] = []
    # Text that is too short to stand alone and has no earlier part to join.
    pending = ""

    # First chunk: text before the first marker (usually the parent header line)
    head = body[:matches[0].start()].strip()
    if len(head) >= 30:
        parts.append({
            "text": head,
            "number": parent_number,
            "kind": "decimal" if parent_number else "paragraph",
        })
    else:
        pending = head

    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(body)
        chunk = body[start:end].strip()

        if len(chunk) < min_length:
            # Too short to be a real subclause — fold into previous, or keep
            # it pending until a part exists so the text is not lost.
            if parts:
                parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
            else:
                pending = (pending + "\n" + chunk).strip()
            continue

        if pending:
            chunk = pending + "\n" + chunk
            pending = ""

        sub_marker = m.group(1).strip()
        composite = f"{parent_number}{sub_marker}" if parent_number else sub_marker
        parts.append({
            "text": chunk,
            "number": composite,
            "kind": "subclause",
        })

    # A single part (or none) is not a useful split; let the caller keep the
    # original clause with its original metadata.
    if len(parts) < 2:
        return []
    return parts
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
    """If still too long, split on sentence boundaries to bound LM input.

    Args:
        clause: ``{"text", "number", "kind"}`` dict; only "text" is required.
        max_len: Target maximum number of characters per returned chunk.

    Returns:
        ``[clause]`` unchanged when its text already fits. Otherwise a list of
        new dicts that share the parent's "number" and carry the parent's
        "kind" with "/chunked" appended. Sentences are packed greedily; a
        single sentence longer than ``max_len`` is broken at the last space or
        newline inside the limit (or mid-word when there is none). Chunks
        shorter than ``MIN_SUBCLAUSE_LEN`` are merged into a neighbour rather
        than dropped, so a chunk may exceed ``max_len`` by fewer than
        ``MIN_SUBCLAUSE_LEN`` + 1 characters.
    """
    body = clause["text"]
    if len(body) <= max_len:
        return [clause]

    limit = max(max_len, 1)  # guards the slicing loop against a zero/negative cap

    # 1) Sentence pieces, none longer than `limit`.
    pieces: list[str] = []
    for sentence in re.split(r'(?<=[\.\?\!])\s+(?=[A-Z])', body):
        while len(sentence) > limit:
            cut = max(sentence.rfind(" ", 0, limit + 1),
                      sentence.rfind("\n", 0, limit + 1))
            if cut <= 0:
                cut = limit  # no usable whitespace: cut mid-word
            piece = sentence[:cut].rstrip()
            if piece:
                pieces.append(piece)
            sentence = sentence[cut:].lstrip()
        if sentence:
            pieces.append(sentence)

    # 2) Greedy packing of pieces into chunks of at most `max_len` chars.
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        if current and len(current) + len(piece) + 1 > max_len:
            chunks.append(current.strip())
            current = piece
        else:
            current = (current + " " + piece).strip() if current else piece
    if current:
        chunks.append(current.strip())

    # 3) Merge undersized chunks into a neighbour instead of discarding them.
    merged: list[str] = []
    carry = ""
    for chunk in chunks:
        if carry:
            chunk = carry + " " + chunk
            carry = ""
        if len(chunk) >= MIN_SUBCLAUSE_LEN:
            merged.append(chunk)
        elif merged:
            merged[-1] = merged[-1] + " " + chunk
        else:
            carry = chunk
    if carry:
        merged.append(carry)

    kind = (clause.get("kind") or "paragraph") + "/chunked"
    return [
        {"text": chunk, "number": clause.get("number"), "kind": kind}
        for chunk in merged
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 142 |
+
# Public API
|
| 143 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
+
def split_into_clauses_with_metadata(
    text: str,
    min_length: int = 40,
) -> list[dict]:
    """Segment contract text into clauses with their markers.

    Three passes are applied:

    1. Primary: cut the text at every header found by ``_collect_headers``.
       Text in front of the first header (preamble, recitals) is kept as a
       "paragraph" entry. When no header-based segment is long enough, the
       text is split on blank lines instead.
    2. Secondary: clauses longer than ``LONG_CLAUSE_CHARS`` are offered to
       ``_split_inline_subclauses``.
    3. Tertiary: every clause goes through ``_hard_cap_split``.

    Args:
        text: Document text; ``None`` or blank text yields ``[]``.
        min_length: Segments shorter than this many characters are skipped in
            the primary pass.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts in document order.
    """
    if not text or not text.strip():
        return []

    headers = _collect_headers(text)

    # ── Primary segmentation (heading-based) ───────────────────────────────
    sections: list[dict] = []
    for i, (start, marker, kind) in enumerate(headers):
        end = headers[i + 1][0] if i + 1 < len(headers) else len(text)
        body = text[start:end].strip()
        if len(body) >= min_length:
            sections.append({"text": body, "number": marker, "kind": kind})

    primary: list[dict] = []
    if sections:
        # Keep whatever precedes the first header instead of dropping it.
        preamble = text[:headers[0][0]].strip()
        if len(preamble) >= min_length:
            primary.append({"text": preamble, "number": None, "kind": "paragraph"})
        primary.extend(sections)
    else:
        # Paragraph fallback when no usable headers were found
        for p in [p.strip() for p in re.split(r'\n\s*\n', text)]:
            if len(p) >= min_length:
                primary.append({"text": p, "number": None, "kind": "paragraph"})

    # ── Secondary pass: inline subclause splitting for long clauses ────────
    refined: list[dict] = []
    for clause in primary:
        if len(clause["text"]) > LONG_CLAUSE_CHARS:
            subs = _split_inline_subclauses(
                clause["text"],
                parent_number=clause.get("number"),
            )
            if subs:
                refined.extend(subs)
                continue
        refined.append(clause)

    # ── Tertiary pass: hard length cap (sentence-boundary chunking) ────────
    final: list[dict] = []
    for clause in refined:
        final.extend(_hard_cap_split(clause))

    return final
|
|
|
|
| 184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
    """Return only the clause texts, for callers that predate the metadata API."""
    clause_texts: list[str] = []
    for entry in split_into_clauses_with_metadata(text, min_length):
        clause_texts.append(entry["text"])
    return clause_texts
|