Spaces:

riyasuryawanshi746
/

Major_Project

Sleeping

App Files Files Community

riyasuryawanshi746 committed on 25 days ago

Commit

6893de4

verified ·

1 Parent(s): af910e9

Fixed PDF preprocessing and clause segmentation

Browse files

Files changed (4) hide show

app.py +7 -1
inference.py +45 -20
local_interpreters.py +24 -12
pdf_utils.py +205 -31

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ except Exception as _e:
 try:
     from local_interpreters import LocalExplainer, build_predict_fn_for_manager
-    local_explainer = LocalExplainer(num_samples=50, timeout_seconds=30.0)
 except Exception as _e:
     print(f"[WARN] LocalExplainer disabled: {_e}")
     local_explainer = None
@@ -252,6 +252,12 @@ class ModelManager:
         features, evidence = self.feature_extractor.extract(text)
         sym_result = _symbolic_rule_score(features, SYMBOLIC_RULES)
         if self.is_ready and self.model is not None:
             try:
                 enc = self.tokenizer(

 try:
     from local_interpreters import LocalExplainer, build_predict_fn_for_manager
+    local_explainer = LocalExplainer(num_samples=25, timeout_seconds=25.0)
 except Exception as _e:
     print(f"[WARN] LocalExplainer disabled: {_e}")
     local_explainer = None
         features, evidence = self.feature_extractor.extract(text)
         sym_result = _symbolic_rule_score(features, SYMBOLIC_RULES)
+        # v5.4: Sanity-check the clause text before inference.
+        # After pdf_utils filtering, this should always be real legal prose.
+        word_count = len(text.split())
+        print(f"[DEBUG] analyze_clause: {word_count} words | "
+              f"preview: {text[:80].replace(chr(10),' ')!r}")
         if self.is_ready and self.model is not None:
             try:
                 enc = self.tokenizer(

inference.py CHANGED Viewed

@@ -1,10 +1,12 @@
 # inference.py
-# Hardening v5.3:
-#  - Neural-dominant fusion (default 0.75 / 0.25, IP 0.60 / 0.40)
-#  - Symbolic floor of 0.30 REMOVED
-#  - Risk-level thresholds: Low < 0.50, Medium 0.50–0.80, High > 0.80
-#  - Confidence recalibrated for the new thresholds
-#  - RULE_FEATURE_DEPS updated for tightened ICA_007
 from __future__ import annotations
@@ -14,8 +16,7 @@ IP_CLAUSE_TYPES = {
     "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
 }
-# Rule -> feature dependencies. Used by the explanation engine to surface
-# matched evidence per rule (no lambda introspection required).
 RULE_FEATURE_DEPS = {
     "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
     "ICA_002": ["unilateral_termination", "notice_period_defined"],
@@ -23,7 +24,6 @@ RULE_FEATURE_DEPS = {
     "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
     "ICA_005": ["is_wagering_clause"],
     "ICA_006": ["restrains_legal_proceedings"],
-    # ICA_007 tightened: now requires has_uncapped_signal too
     "ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
     "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
     "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
@@ -41,9 +41,12 @@ RULE_FEATURE_DEPS = {
 RISK_LOW_MAX    = 0.50    # < 0.50 → Low
 RISK_MEDIUM_MAX = 0.80    # 0.50–0.80 → Medium; > 0.80 → High
 def level_from_score(score: float) -> tuple[str, str]:
-    """Return (level_label, emoji) for a fused score under the v5.3 thresholds."""
     if score < RISK_LOW_MAX:
         return "Low",    "🟢"
     if score <= RISK_MEDIUM_MAX:
@@ -76,8 +79,6 @@ def _neuro_symbolic_fusion(
     Weighted fusion — neural-dominant by design.
     No artificial floor: a weak symbolic trigger no longer inflates risk.
     """
-    # Default neural-dominant. IP clauses give a bit more weight to symbolic,
-    # but symbolic NEVER outweighs neural.
     if is_ip_clause and symbolic > 0:
         w_n, w_s = 0.60, 0.40
     else:
@@ -102,7 +103,7 @@ def _neuro_symbolic_fusion(
             "symbolic_score": round(symbolic, 3),
             "weights":        {"neural": w_n, "symbolic": w_s},
             "raw_fused":      round(raw, 3),
-            "floor_applied":  False,   # retained for UI compat; always False now
             "final":          score,
             "formula":        formula,
         },
@@ -117,19 +118,43 @@ def _compute_confidence(
     neural_loaded: bool = True,
 ) -> dict:
     """
-    Three-factor confidence calibrated for the new thresholds.
-      boundary_dist  – distance from the nearest risk-level cutoff (0.50, 0.80)
-      agreement      – 1 - |neural - symbolic|  (only when neural is loaded)
-      rule_strength  – more triggered rules ⇒ stronger deterministic evidence
     """
     boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
     dist_factor   = min(boundary_dist / 0.20, 1.0)
-    if neural_loaded:
-        agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
     else:
-        agree_factor = 0.5
     if   num_triggered == 0: rule_factor = 0.40
     elif num_triggered == 1: rule_factor = 0.70
     else:                    rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

 # inference.py
+# v5.4 — Confidence calibration fixed.
+# Root cause of "Neural=0.993, Confidence=LOW 21%" bug:
+#   The agreement factor penalized high-neural / zero-symbolic as "disagreement",
+#   but this is a legitimate state (neural model is certain; no rules triggered).
+#   Fix: agreement is now only computed between the two scores when BOTH are
+#   non-trivial (> 0.05). When symbolic is near zero, we treat the neural score
+#   alone as the evidence and scale the agreement factor with how decisive it
+#   is (0.50 when borderline, up to 0.80 at the extremes) rather than nearly 0.
 from __future__ import annotations
     "Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
 }
+# Rule -> feature dependencies (unchanged)
 RULE_FEATURE_DEPS = {
     "ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
     "ICA_002": ["unilateral_termination", "notice_period_defined"],
     "ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
     "ICA_005": ["is_wagering_clause"],
     "ICA_006": ["restrains_legal_proceedings"],
     "ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
     "ICA_008": ["has_auto_renewal", "has_opt_out_window"],
     "ICA_009": ["has_arbitration", "arbitration_distant_venue"],
 RISK_LOW_MAX    = 0.50    # < 0.50 → Low
 RISK_MEDIUM_MAX = 0.80    # 0.50–0.80 → Medium; > 0.80 → High
+# Threshold below which a score is considered "near zero" for agreement logic
+_TRIVIAL_SCORE = 0.05
 def level_from_score(score: float) -> tuple[str, str]:
+    """Return (level_label, emoji) for a fused score under the v5.4 thresholds."""
     if score < RISK_LOW_MAX:
         return "Low",    "🟢"
     if score <= RISK_MEDIUM_MAX:
     Weighted fusion — neural-dominant by design.
     No artificial floor: a weak symbolic trigger no longer inflates risk.
     """
     if is_ip_clause and symbolic > 0:
         w_n, w_s = 0.60, 0.40
     else:
             "symbolic_score": round(symbolic, 3),
             "weights":        {"neural": w_n, "symbolic": w_s},
             "raw_fused":      round(raw, 3),
+            "floor_applied":  False,
             "final":          score,
             "formula":        formula,
         },
     neural_loaded: bool = True,
 ) -> dict:
     """
+    Three-factor confidence calibrated for v5.4.
+    boundary_dist  – distance from the nearest risk-level boundary (0.50, 0.80).
+                     A score far from any boundary is a clear-cut decision.
+    agreement      – alignment between neural and symbolic signals.
+                     FIX v5.4: when symbolic is near-zero (no rules triggered),
+                     we do NOT treat this as "disagreement". High neural + no
+                     symbolic rules is a perfectly consistent, informative state.
+                     Agreement is only penalised when BOTH scores are non-trivial
+                     and they point in opposite directions.
+    rule_strength  – more triggered rules ⇒ stronger deterministic evidence.
     """
+    # Factor 1: distance from nearest risk boundary
     boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
     dist_factor   = min(boundary_dist / 0.20, 1.0)
+    # Factor 2: agreement (FIXED)
+    if not neural_loaded:
+        # No neural signal at all — moderate confidence
+        agree_factor = 0.50
+    elif symbolic <= _TRIVIAL_SCORE:
+        # Symbolic is near-zero: no rules fired. Neural is the only signal.
+        # This is NOT disagreement — treat as a confident neural-only verdict.
+        # Scale agreement by how decisive the neural score is:
+        #   neural close to 0 or 1  → high confidence (0.80)
+        #   neural near 0.50 (borderline) → lower confidence (0.50)
+        neural_decisiveness = abs(neural - 0.50) / 0.50   # 0 at boundary, 1 at extremes
+        agree_factor = 0.50 + 0.30 * neural_decisiveness   # range [0.50, 0.80]
+    elif neural <= _TRIVIAL_SCORE:
+        # Neural is near-zero: symbolic rules fired but model disagrees.
+        # Genuine disagreement → low agreement factor.
+        agree_factor = 0.30
     else:
+        # Both signals are non-trivial: measure actual divergence.
+        agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
+    # Factor 3: rule strength
     if   num_triggered == 0: rule_factor = 0.40
     elif num_triggered == 1: rule_factor = 0.70
     else:                    rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)

local_interpreters.py CHANGED Viewed

@@ -1,7 +1,13 @@
 # local_interpreters.py
-# v5.3 — LIME output filtered for legal interpretability.
-# The neural model still sees ORIGINAL text. Filtering happens at the
-# display layer only.
 from __future__ import annotations
 import re
@@ -17,7 +23,7 @@ except Exception:
 # ─────────────────────────────────────────────────────────────────────────────
-# Token filtering — display-time only
 # ─────────────────────────────────────────────────────────────────────────────
 LIME_STOPWORDS = {
     "a", "an", "the",
@@ -28,11 +34,10 @@ LIME_STOPWORDS = {
     "be", "is", "are", "was", "were", "been", "being",
     "have", "has", "had", "do", "does", "did",
     "as", "if", "so", "than", "then",
-    "any", "all", "such", "no",   # ambiguous but mostly noise here
     "i", "we", "you", "they", "he", "she",
 }
-# Legal modal / operative words — never filter these even if they look small
 LIME_KEEP_LEGAL = {
     "shall", "may", "must", "not", "only", "unless", "except", "without",
     "subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
@@ -44,12 +49,10 @@ LIME_KEEP_LEGAL = {
     "confidential", "disclose", "non-compete", "non-solicit",
 }
-# Roman numeral regex (used for filtering things like "IV", "xii")
 _ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
 def _clean_token(raw: str) -> str:
-    """Strip leading/trailing punctuation; return lowercased core."""
     return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
@@ -65,7 +68,6 @@ def _is_useful_lime_token(raw: str) -> bool:
         return False
     if _ROMAN_NUMERAL.fullmatch(core):
         return False
-    # Need at least 2 alphanumeric chars to be a meaningful word
     if sum(c.isalnum() for c in core) < 2:
         return False
     return True
@@ -75,7 +77,11 @@ def _is_useful_lime_token(raw: str) -> bool:
 # LocalExplainer
 # ─────────────────────────────────────────────────────────────────────────────
 class LocalExplainer:
-    def __init__(self, num_samples: int = 50, timeout_seconds: float = 30.0):
         self.num_samples     = num_samples
         self.timeout_seconds = timeout_seconds
         self.last_error      = ""
@@ -100,6 +106,9 @@ class LocalExplainer:
         """
         Returns up to `display_count` filtered token contributions, sorted by
         absolute weight. The model itself still sees the original full text.
         """
         if not _LIME_AVAILABLE or self._lime is None:
             return []
@@ -109,13 +118,16 @@ class LocalExplainer:
                 text_instance=text[:1500],
                 classifier_fn=predict_fn,
                 num_features=num_features,
-                num_samples=self.num_samples,
                 labels=(1,),
             )
             elapsed = time.time() - t0
             raw_pairs = exp.as_list(label=1)
-            # Filter for legal interpretability
             filtered: list[tuple[str, float]] = [
                 (w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
             ]

 # local_interpreters.py
+# v5.4 — Performance optimisations:
+#   • Default LIME num_samples reduced from 50 → 25 (halves per-clause latency)
+#   • LocalExplainer.explain_with_lime() logs a warning when a run exceeds timeout_seconds (checked after LIME returns; the run is not aborted)
+#   • get_attention_map() is unchanged (already gated by caller)
+#   • build_predict_fn_for_manager() unchanged
+#
+# IMPORTANT: app.py must ensure LIME and attention are called LAZILY
+# (only when the user selects a clause in the explorer), NOT during bulk
+# document analysis. This file only handles the explainer logic itself.
 from __future__ import annotations
 import re
 # ─────────────────────────────────────────────────────────────────────────────
+# Token filtering — display-time only (unchanged from v5.3)
 # ─────────────────────────────────────────────────────────────────────────────
 LIME_STOPWORDS = {
     "a", "an", "the",
     "be", "is", "are", "was", "were", "been", "being",
     "have", "has", "had", "do", "does", "did",
     "as", "if", "so", "than", "then",
+    "any", "all", "such", "no",
     "i", "we", "you", "they", "he", "she",
 }
 LIME_KEEP_LEGAL = {
     "shall", "may", "must", "not", "only", "unless", "except", "without",
     "subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
     "confidential", "disclose", "non-compete", "non-solicit",
 }
 _ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
 def _clean_token(raw: str) -> str:
     return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
         return False
     if _ROMAN_NUMERAL.fullmatch(core):
         return False
     if sum(c.isalnum() for c in core) < 2:
         return False
     return True
 # LocalExplainer
 # ─────────────────────────────────────────────────────────────────────────────
 class LocalExplainer:
+    def __init__(
+        self,
+        num_samples: int = 25,          # v5.4: reduced from 50 → 25
+        timeout_seconds: float = 25.0,  # v5.4: tightened from 30 → 25 s
+    ):
         self.num_samples     = num_samples
         self.timeout_seconds = timeout_seconds
         self.last_error      = ""
         """
         Returns up to `display_count` filtered token contributions, sorted by
         absolute weight. The model itself still sees the original full text.
+        v5.4: Uses self.num_samples (default 25, down from 50).
+        Caller is responsible for invoking this only on-demand (lazy).
         """
         if not _LIME_AVAILABLE or self._lime is None:
             return []
                 text_instance=text[:1500],
                 classifier_fn=predict_fn,
                 num_features=num_features,
+                num_samples=self.num_samples,   # v5.4: 25 by default
                 labels=(1,),
             )
             elapsed = time.time() - t0
+            if elapsed > self.timeout_seconds:
+                print(f"[WARN] LIME took {elapsed:.1f}s > timeout {self.timeout_seconds}s")
             raw_pairs = exp.as_list(label=1)
             filtered: list[tuple[str, float]] = [
                 (w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
             ]

pdf_utils.py CHANGED Viewed

@@ -1,8 +1,12 @@
 # pdf_utils.py
-# v5.3 — finer-grained clause segmentation.
-# Adds:
-#   • Inline subclause splitting for long clauses (a), (b), (c), (i), (ii) …
-#   • Hard length cap with sentence-boundary fallback
 from __future__ import annotations
 import re
@@ -14,7 +18,177 @@ MIN_SUBCLAUSE_LEN = 60
 # ─────────────────────────────────────────────────────────────────────────────
-# PDF extraction (unchanged)
 # ─────────────────────────────────────────────────────────────────────────────
 def extract_text_from_pdf(file_path: str) -> str:
     import fitz
@@ -22,15 +196,12 @@ def extract_text_from_pdf(file_path: str) -> str:
     pages = [page.get_text("text") for page in doc]
     doc.close()
     raw = "\n".join(pages)
-    raw = re.sub(r'\r\n', '\n', raw)
-    raw = re.sub(r'[ \t]+', ' ', raw)
-    raw = re.sub(r'\n{3,}', '\n\n', raw)
-    raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)
-    return raw.strip()
 # ─────────────────────────────────────────────────────────────────────────────
-# Header detection (primary segmentation)
 # ─────────────────────────────────────────────────────────────────────────────
 _HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
     ("decimal",  re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
@@ -42,7 +213,6 @@ _HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
     ("caps",     re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
 ]
-# Inline subclause markers — used in the SECOND pass (mid-text, not line-start)
 _INLINE_SUBCLAUSE = re.compile(
     r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
     re.IGNORECASE,
@@ -55,7 +225,6 @@ def _collect_headers(text: str) -> list[tuple[int, str, str]]:
         for m in pat.finditer(text):
             hits.append((m.start(1), m.group(1).strip(), kind))
     hits.sort(key=lambda h: h[0])
     deduped: list[tuple[int, str, str]] = []
     for h in hits:
         if not deduped or abs(h[0] - deduped[-1][0]) > 2:
@@ -64,25 +233,17 @@ def _collect_headers(text: str) -> list[tuple[int, str, str]]:
 # ─────────────────────────────────────────────────────────────────────────────
-# Inline subclause post-processing
 # ─────────────────────────────────────────────────────────────────────────────
 def _split_inline_subclauses(
     body: str,
     parent_number: str | None = None,
     min_length: int = MIN_SUBCLAUSE_LEN,
 ) -> list[dict]:
-    """
-    If the clause body contains ≥ 2 inline subclause markers, split it.
-    Subclause numbers are prefixed with the parent (e.g. "5.7" + "(a)" → "5.7(a)").
-    Returns [] if no useful split is possible (caller keeps the original).
-    """
     matches = list(_INLINE_SUBCLAUSE.finditer(body))
     if len(matches) < 2:
         return []
     parts: list[dict] = []
-    # First chunk: text before the first marker (usually the parent header line)
     head = body[:matches[0].start()].strip()
     if head and len(head) >= 30:
         parts.append({
@@ -90,17 +251,14 @@ def _split_inline_subclauses(
             "number": parent_number,
             "kind":   "decimal" if parent_number else "paragraph",
         })
     for i, m in enumerate(matches):
         start = m.start()
         end   = matches[i + 1].start() if i + 1 < len(matches) else len(body)
         chunk = body[start:end].strip()
         if len(chunk) < min_length:
-            # Too short to be a real subclause — fold into previous
             if parts:
                 parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
             continue
         sub_marker = m.group(1).strip()
         composite  = f"{parent_number}{sub_marker}" if parent_number else sub_marker
         parts.append({
@@ -108,12 +266,10 @@ def _split_inline_subclauses(
             "number": composite,
             "kind":   "subclause",
         })
     return parts
 def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
-    """If still too long, split on sentence boundaries to bound LM input."""
     body = clause["text"]
     if len(body) <= max_len:
         return [clause]
@@ -127,7 +283,6 @@ def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]
             current = (current + " " + s).strip() if current else s
     if current:
         chunks.append(current.strip())
     return [
         {
             "text":   c,
@@ -145,9 +300,13 @@ def split_into_clauses_with_metadata(
     text: str,
     min_length: int = 40,
 ) -> list[dict]:
     headers = _collect_headers(text)
-    # ── Primary segmentation (heading-based) ───────────────────────────────
     primary: list[dict] = []
     if headers:
         for i, (start, marker, kind) in enumerate(headers):
@@ -162,6 +321,14 @@ def split_into_clauses_with_metadata(
             if len(p) >= min_length:
                 primary.append({"text": p, "number": None, "kind": "paragraph"})
     # ── Secondary pass: inline subclause splitting for long clauses ────────
     refined: list[dict] = []
     for clause in primary:
@@ -175,12 +342,19 @@ def split_into_clauses_with_metadata(
                 continue
         refined.append(clause)
-    # ── Tertiary pass: hard length cap (sentence-boundary chunking) ────────
     final: list[dict] = []
     for clause in refined:
         final.extend(_hard_cap_split(clause))
-    return final
 def split_into_clauses(text: str, min_length: int = 40) -> list[str]:

 # pdf_utils.py
+# v5.4 — Robust PDF preprocessing: TOC removal, garbage filtering, finer segmentation.
+# Changes vs v5.3:
+#   • extract_text_from_pdf() now calls clean_raw_pdf_text() after extraction
+#   • clean_raw_pdf_text() strips page numbers, separator lines, OCR noise,
+#     repeated doc titles, running headers/footers
+#   • is_toc_block() heuristic detects and rejects Table of Contents chunks
+#   • is_garbage_clause() rejects structurally empty / metadata-only chunks
+#   • split_into_clauses_with_metadata() integrates both filters before returning
 from __future__ import annotations
 import re
 # ─────────────────────────────────────────────────────────────────────────────
+# Step 1 — Raw text cleaning (runs immediately after PyMuPDF extraction)
+# ─────────────────────────────────────────────────────────────────────────────
# Standalone page number line: e.g. "19", "- 3 -", "Page 4", "PAGE 4 OF 12".
# Only [ \t] is allowed between the parts so the match can never spill onto a
# neighbouring line (``\s`` would also match newlines).
_PAGE_NUM_LINE = re.compile(
    r'(?m)^[ \t]*(?:[-–—]*[ \t]*)?(?:page[ \t]+)?\d{1,4}'
    r'(?:[ \t]+of[ \t]+\d{1,4})?(?:[ \t]*[-–—]*)?[ \t]*$',
    re.IGNORECASE,
)
# Roman-numeral-only lines (TOC page markers: i, ii, iii, iv, v, …)
_ROMAN_PAGE_LINE = re.compile(
    r'(?m)^[ \t]*[ivxlcdmIVXLCDM]{1,6}[ \t]*$'
)
# Horizontal separator lines: "___", "---", "===", "* * *", etc.
# Each marker may be followed by blanks so spaced-out rules are matched too.
_SEPARATOR_LINE = re.compile(
    r'(?m)^[ \t]*(?:[-=_*·•][ \t]*){3,}$'
)
# Running header/footer patterns that repeat every page
# e.g. "AGREEMENT AND PLAN OF MERGER", "CONFIDENTIAL", "EXECUTION VERSION".
# The marker itself is matched case-insensitively, but the rest of the line
# must not contain lowercase letters: that keeps prose such as
# "Confidential Information means ..." or "Schedule 1 sets out ..." intact.
_RUNNING_HEADER = re.compile(
    r'(?m)^[ \t]*(?i:AGREEMENT AND PLAN OF|EXECUTION COPY|EXECUTION VERSION|'
    r'CONFIDENTIAL|DRAFT|PRIVILEGED AND CONFIDENTIAL|'
    r'EXHIBIT [A-Z]|SCHEDULE [A-Z\d])[^\na-z]*$'
)
# TOC "dot-leader" lines: "Section 7.04 ............ 43"
# The leader is restricted to dots and blanks on the SAME line; with ``\s`` the
# pattern also swallowed "<last line of page>.\n \n<page number>".
_TOC_DOT_LEADER = re.compile(
    r'(?m)^[^\n]{5,80}[. \t]{4,}\d{1,4}[ \t]*$'
)


def clean_raw_pdf_text(raw: str) -> str:
    """
    Post-extraction cleaning: remove artefacts that corrupt clause segmentation.

    The goal is NOT to remove legal content — only structural/metadata noise
    (page numbers, separator rules, running headers, TOC dot-leader lines).
    Every pattern used here is line-local, so removing one noisy line never
    takes an adjacent line of prose with it.

    Args:
        raw: Text as extracted from the PDF (may be empty).

    Returns:
        The cleaned text with runs of blank lines collapsed and outer
        whitespace stripped.
    """
    text = raw

    # 1. Normalize line endings and excessive whitespace
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{4,}', '\n\n\n', text)

    # 2. Remove TOC dot-leader lines BEFORE page numbers are stripped, while
    #    the trailing page number that identifies them is still present
    text = _TOC_DOT_LEADER.sub('', text)

    # 3. Running headers / footers
    text = _RUNNING_HEADER.sub('', text)

    # 4. Standalone page numbers and roman numerals
    text = _PAGE_NUM_LINE.sub('', text)
    text = _ROMAN_PAGE_LINE.sub('', text)

    # 5. Separator lines
    text = _SEPARATOR_LINE.sub('', text)

    # 6. "TABLE OF CONTENTS" heading itself
    text = re.sub(
        r'(?m)^[ \t]*TABLE\s+OF\s+CONTENTS[ \t]*$', '', text, flags=re.IGNORECASE
    )

    # 7. Collapse runs of blank lines left by removals
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
+# ─────────────────────────────────────────────────────────────────────────────
+# Step 2 — TOC block detection (per-clause heuristic)
+# ─────────────────────────────────────────────────────────────────────────────
# Structural reference such as "Section 7.04", "ARTICLE IV", "Exhibit A".
# The keyword is case-insensitive, but the designator must be a digit or an
# upper-case token; otherwise ordinary prose ("this section shall ...") would
# be counted as a TOC reference.
_TOC_SECTION_REF = re.compile(
    r'(?i:Section|Article|Schedule|Annex|Exhibit)\s+(?:\d|[A-Z]+\b)'
)
# A line that is ONLY a heading / short label (no sentence punctuation)
_HEADING_ONLY_LINE = re.compile(
    r'(?m)^[ \t]*[A-Z][A-Za-z0-9 &/\-]{2,50}[ \t]*$'
)
# Rough "this line is a sentence" proxy: a modal or operative verb.
# The indemnify / terminate / license entries are spelled out as verb forms;
# a bare stem followed by \b (e.g. r'indemnif\b') can never match a real word.
_PROSE_VERB = re.compile(
    r'\b(?:shall|will|may|must|agree|provide|require|include|'
    r'warrant|represent|indemnif(?:y|ies|ied)|terminat(?:e|es|ed)|govern|'
    r'licens(?:e|es|ed)|assign|'
    r'disclose|notify|maintain|ensure|permit|restrict)\b',
    re.IGNORECASE,
)


def is_toc_block(text: str) -> bool:
    """
    Return True if this chunk looks like a Table of Contents entry or
    a run of section listings that are not real legal prose.

    Heuristics (any one is sufficient to flag):
      A. ≥ 4 "Section X.XX / ARTICLE X" references while at most
         max(1, 15% of the non-empty lines) contain a prose verb
      B. Fewer than 80 words, ≥ 3 section references and no line with a
         prose verb (a short clause that merely cites sections is kept)
      C. At least 4 non-empty lines of which more than 60% are heading-only

    Empty or whitespace-only input is reported as a TOC block so callers
    discard it.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    total_lines = len(lines)
    if total_lines == 0:
        return True  # empty → garbage

    section_refs = len(_TOC_SECTION_REF.findall(text))
    # Count lines that contain at least one verb-like word (rough sentence proxy)
    sentence_lines = sum(1 for ln in lines if _PROSE_VERB.search(ln))
    word_count = len(text.split())

    # Heuristic A: many section refs, almost no substantive sentences
    if section_refs >= 4 and sentence_lines <= max(1, total_lines * 0.15):
        return True

    # Heuristic B: very short, several section refs and no prose at all
    # (classic TOC listing)
    if word_count < 80 and section_refs >= 3 and sentence_lines == 0:
        return True

    # Heuristic C: heading-only lines dominate
    heading_lines = sum(1 for ln in lines if _HEADING_ONLY_LINE.fullmatch(ln))
    if total_lines >= 4 and heading_lines / total_lines > 0.60:
        return True

    return False
+# ─────────────────────────────────────────────────────────────────────────────
+# Step 3 — Garbage clause filter (pre-inference gate)
+# ─────────────────────────────────────────────────────────────────────────────
def is_garbage_clause(text: str, min_words: int = 15) -> bool:
    """
    Return True for chunks that should never reach the neural model:
      • Empty, or too short to be a real clause (< ``min_words`` words)
      • Mostly digits / page references (> 35% digit tokens)
      • Mostly structural labels such as "Section" / "Article" (> 25%)
      • No alphabetic word longer than 3 characters
      • Flagged as a Table of Contents block by ``is_toc_block``

    Args:
        text: Candidate clause text.
        min_words: Minimum whitespace-separated word count for a clause.

    Returns:
        True if the chunk should be discarded, False if it looks like prose.
    """
    words = text.split()
    # The explicit emptiness test keeps the ratios below from dividing by zero
    # when a caller passes min_words <= 0.
    if not words or len(words) < min_words:
        return True

    # Strip the same surrounding punctuation for every per-token check so that
    # "12," counts as a digit token and "agreement," counts as a real word.
    tokens = [w.strip('.,;:()') for w in words]

    # Too many digit tokens (page-number contamination)
    digit_ratio = sum(1 for t in tokens if t.isdigit()) / len(words)
    if digit_ratio > 0.35:
        return True

    # Too many "Section" / "Article" tokens relative to word count
    struct_tokens = len(re.findall(
        r'\b(?:Section|ARTICLE|Article|Exhibit|Schedule|Annex|Appendix|Part|Chapter)\b',
        text, re.IGNORECASE,
    ))
    if struct_tokens / len(words) > 0.25:
        return True

    # No alphabetic word longer than 3 chars → pure noise / numbering block
    if not any(len(t) > 3 and t.isalpha() for t in tokens):
        return True

    # Delegate to TOC detector
    return is_toc_block(text)
+# ─────────────────────────────────────────────────────────────────────────────
+# PDF extraction (wraps clean step)
 # ─────────────────────────────────────────────────────────────────────────────
 def extract_text_from_pdf(file_path: str) -> str:
     import fitz
     pages = [page.get_text("text") for page in doc]
     doc.close()
     raw = "\n".join(pages)
+    raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)   # de-hyphenate before cleaning
+    return clean_raw_pdf_text(raw)
 # ─────────────────────────────────────────────────────────────────────────────
+# Header detection (primary segmentation) — unchanged from v5.3
 # ─────────────────────────────────────────────────────────────────────────────
 _HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
     ("decimal",  re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
     ("caps",     re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
 ]
 _INLINE_SUBCLAUSE = re.compile(
     r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
     re.IGNORECASE,
         for m in pat.finditer(text):
             hits.append((m.start(1), m.group(1).strip(), kind))
     hits.sort(key=lambda h: h[0])
     deduped: list[tuple[int, str, str]] = []
     for h in hits:
         if not deduped or abs(h[0] - deduped[-1][0]) > 2:
 # ─────────────────────────────────────────────────────────────────────────────
+# Inline subclause splitting — unchanged from v5.3
 # ─────────────────────────────────────────────────────────────────────────────
 def _split_inline_subclauses(
     body: str,
     parent_number: str | None = None,
     min_length: int = MIN_SUBCLAUSE_LEN,
 ) -> list[dict]:
     matches = list(_INLINE_SUBCLAUSE.finditer(body))
     if len(matches) < 2:
         return []
     parts: list[dict] = []
     head = body[:matches[0].start()].strip()
     if head and len(head) >= 30:
         parts.append({
             "number": parent_number,
             "kind":   "decimal" if parent_number else "paragraph",
         })
     for i, m in enumerate(matches):
         start = m.start()
         end   = matches[i + 1].start() if i + 1 < len(matches) else len(body)
         chunk = body[start:end].strip()
         if len(chunk) < min_length:
             if parts:
                 parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
             continue
         sub_marker = m.group(1).strip()
         composite  = f"{parent_number}{sub_marker}" if parent_number else sub_marker
         parts.append({
             "number": composite,
             "kind":   "subclause",
         })
     return parts
 def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
     body = clause["text"]
     if len(body) <= max_len:
         return [clause]
             current = (current + " " + s).strip() if current else s
     if current:
         chunks.append(current.strip())
     return [
         {
             "text":   c,
     text: str,
     min_length: int = 40,
 ) -> list[dict]:
+    """
+    Segment text into clauses, filter TOC/garbage, return clean list.
+    This is the single entry-point used by analyze_document().
+    """
     headers = _collect_headers(text)
+    # ── Primary segmentation (heading-based) ──────────────────────────────
     primary: list[dict] = []
     if headers:
         for i, (start, marker, kind) in enumerate(headers):
             if len(p) >= min_length:
                 primary.append({"text": p, "number": None, "kind": "paragraph"})
+    # ── TOC / garbage filter (NEW in v5.4) ────────────────────────────────
+    primary = [c for c in primary if not is_garbage_clause(c["text"])]
+    if not primary:
+        # If everything was filtered, fall back to a single clause holding the
+        # first 2000 characters of the text rather than returning an empty list.
+        return [{"text": text[:2000], "number": None, "kind": "paragraph"}]
     # ── Secondary pass: inline subclause splitting for long clauses ────────
     refined: list[dict] = []
     for clause in primary:
                 continue
         refined.append(clause)
+    # ── Tertiary pass: hard length cap ────────────────────────────────────
     final: list[dict] = []
     for clause in refined:
         final.extend(_hard_cap_split(clause))
+    # ── Final garbage sweep after splitting ───────────────────────────────
+    # Splitting can produce tiny chunks — filter them out too.
+    final = [c for c in final if not is_garbage_clause(c["text"])]
+    print(f"[INFO] Segmentation: {len(primary)} primary → "
+          f"{len(refined)} refined → {len(final)} final clean clauses")
+    return final if final else [{"text": text[:2000], "number": None, "kind": "paragraph"}]
 def split_into_clauses(text: str, min_length: int = 40) -> list[str]: