Spaces:
Sleeping
Sleeping
Fixed PDF preprocessing and clause segmentation
Browse files
- app.py +7 -1
- inference.py +45 -20
- local_interpreters.py +24 -12
- pdf_utils.py +205 -31
app.py
CHANGED
|
@@ -36,7 +36,7 @@ except Exception as _e:
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
from local_interpreters import LocalExplainer, build_predict_fn_for_manager
|
| 39 |
-
local_explainer = LocalExplainer(num_samples=
|
| 40 |
except Exception as _e:
|
| 41 |
print(f"[WARN] LocalExplainer disabled: {_e}")
|
| 42 |
local_explainer = None
|
|
@@ -252,6 +252,12 @@ class ModelManager:
|
|
| 252 |
features, evidence = self.feature_extractor.extract(text)
|
| 253 |
sym_result = _symbolic_rule_score(features, SYMBOLIC_RULES)
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
if self.is_ready and self.model is not None:
|
| 256 |
try:
|
| 257 |
enc = self.tokenizer(
|
|
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
from local_interpreters import LocalExplainer, build_predict_fn_for_manager
|
| 39 |
+
local_explainer = LocalExplainer(num_samples=25, timeout_seconds=25.0)
|
| 40 |
except Exception as _e:
|
| 41 |
print(f"[WARN] LocalExplainer disabled: {_e}")
|
| 42 |
local_explainer = None
|
|
|
|
| 252 |
features, evidence = self.feature_extractor.extract(text)
|
| 253 |
sym_result = _symbolic_rule_score(features, SYMBOLIC_RULES)
|
| 254 |
|
| 255 |
+
# v5.4: Sanity-check the clause text before inference.
|
| 256 |
+
# After pdf_utils filtering, this should always be real legal prose.
|
| 257 |
+
word_count = len(text.split())
|
| 258 |
+
print(f"[DEBUG] analyze_clause: {word_count} words | "
|
| 259 |
+
f"preview: {text[:80].replace(chr(10),' ')!r}")
|
| 260 |
+
|
| 261 |
if self.is_ready and self.model is not None:
|
| 262 |
try:
|
| 263 |
enc = self.tokenizer(
|
inference.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
# inference.py
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
#
|
| 5 |
-
#
|
| 6 |
-
#
|
| 7 |
-
#
|
|
|
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
|
@@ -14,8 +16,7 @@ IP_CLAUSE_TYPES = {
|
|
| 14 |
"Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
|
| 15 |
}
|
| 16 |
|
| 17 |
-
# Rule -> feature dependencies
|
| 18 |
-
# matched evidence per rule (no lambda introspection required).
|
| 19 |
RULE_FEATURE_DEPS = {
|
| 20 |
"ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
|
| 21 |
"ICA_002": ["unilateral_termination", "notice_period_defined"],
|
|
@@ -23,7 +24,6 @@ RULE_FEATURE_DEPS = {
|
|
| 23 |
"ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
|
| 24 |
"ICA_005": ["is_wagering_clause"],
|
| 25 |
"ICA_006": ["restrains_legal_proceedings"],
|
| 26 |
-
# ICA_007 tightened: now requires has_uncapped_signal too
|
| 27 |
"ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
|
| 28 |
"ICA_008": ["has_auto_renewal", "has_opt_out_window"],
|
| 29 |
"ICA_009": ["has_arbitration", "arbitration_distant_venue"],
|
|
@@ -41,9 +41,12 @@ RULE_FEATURE_DEPS = {
|
|
| 41 |
RISK_LOW_MAX = 0.50 # < 0.50 → Low
|
| 42 |
RISK_MEDIUM_MAX = 0.80 # 0.50–0.80 → Medium; > 0.80 → High
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
def level_from_score(score: float) -> tuple[str, str]:
|
| 46 |
-
"""Return (level_label, emoji) for a fused score under the v5.
|
| 47 |
if score < RISK_LOW_MAX:
|
| 48 |
return "Low", "π’"
|
| 49 |
if score <= RISK_MEDIUM_MAX:
|
|
@@ -76,8 +79,6 @@ def _neuro_symbolic_fusion(
|
|
| 76 |
Weighted fusion — neural-dominant by design.
|
| 77 |
No artificial floor: a weak symbolic trigger no longer inflates risk.
|
| 78 |
"""
|
| 79 |
-
# Default neural-dominant. IP clauses give a bit more weight to symbolic,
|
| 80 |
-
# but symbolic NEVER outweighs neural.
|
| 81 |
if is_ip_clause and symbolic > 0:
|
| 82 |
w_n, w_s = 0.60, 0.40
|
| 83 |
else:
|
|
@@ -102,7 +103,7 @@ def _neuro_symbolic_fusion(
|
|
| 102 |
"symbolic_score": round(symbolic, 3),
|
| 103 |
"weights": {"neural": w_n, "symbolic": w_s},
|
| 104 |
"raw_fused": round(raw, 3),
|
| 105 |
-
"floor_applied": False,
|
| 106 |
"final": score,
|
| 107 |
"formula": formula,
|
| 108 |
},
|
|
@@ -117,19 +118,43 @@ def _compute_confidence(
|
|
| 117 |
neural_loaded: bool = True,
|
| 118 |
) -> dict:
|
| 119 |
"""
|
| 120 |
-
Three-factor confidence calibrated for
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"""
|
|
|
|
| 125 |
boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
|
| 126 |
dist_factor = min(boundary_dist / 0.20, 1.0)
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
else:
|
| 131 |
-
|
|
|
|
| 132 |
|
|
|
|
| 133 |
if num_triggered == 0: rule_factor = 0.40
|
| 134 |
elif num_triggered == 1: rule_factor = 0.70
|
| 135 |
else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
|
|
|
|
| 1 |
# inference.py
|
| 2 |
+
# v5.4 — Confidence calibration fixed.
|
| 3 |
+
# Root cause of "Neural=0.993, Confidence=LOW 21%" bug:
|
| 4 |
+
# The agreement factor penalized high-neural / zero-symbolic as "disagreement",
|
| 5 |
+
# but this is a legitimate state (neural model is certain; no rules triggered).
|
| 6 |
+
# Fix: agreement is now only computed between the two scores when BOTH are
|
| 7 |
+
# non-trivial (> 0.05). When symbolic is near zero, we treat the neural score
|
| 8 |
+
# alone as the evidence and give a neutral agreement factor of 0.70 rather
|
| 9 |
+
# than nearly 0.
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
|
|
|
|
| 16 |
"Unlimited/All-You-Can-Eat-License", "Source Code Escrow",
|
| 17 |
}
|
| 18 |
|
| 19 |
+
# Rule -> feature dependencies (unchanged)
|
|
|
|
| 20 |
RULE_FEATURE_DEPS = {
|
| 21 |
"ICA_001": ["has_liability_cap", "excludes_gross_negligence"],
|
| 22 |
"ICA_002": ["unilateral_termination", "notice_period_defined"],
|
|
|
|
| 24 |
"ICA_004": ["has_liquidated_damages", "damages_exceed_loss"],
|
| 25 |
"ICA_005": ["is_wagering_clause"],
|
| 26 |
"ICA_006": ["restrains_legal_proceedings"],
|
|
|
|
| 27 |
"ICA_007": ["has_indemnity_clause", "indemnity_capped", "has_uncapped_signal"],
|
| 28 |
"ICA_008": ["has_auto_renewal", "has_opt_out_window"],
|
| 29 |
"ICA_009": ["has_arbitration", "arbitration_distant_venue"],
|
|
|
|
| 41 |
RISK_LOW_MAX = 0.50 # < 0.50 → Low
|
| 42 |
RISK_MEDIUM_MAX = 0.80 # 0.50–0.80 → Medium; > 0.80 → High
|
| 43 |
|
| 44 |
+
# Threshold below which a score is considered "near zero" for agreement logic
|
| 45 |
+
_TRIVIAL_SCORE = 0.05
|
| 46 |
+
|
| 47 |
|
| 48 |
def level_from_score(score: float) -> tuple[str, str]:
|
| 49 |
+
"""Return (level_label, emoji) for a fused score under the v5.4 thresholds."""
|
| 50 |
if score < RISK_LOW_MAX:
|
| 51 |
return "Low", "π’"
|
| 52 |
if score <= RISK_MEDIUM_MAX:
|
|
|
|
| 79 |
Weighted fusion — neural-dominant by design.
|
| 80 |
No artificial floor: a weak symbolic trigger no longer inflates risk.
|
| 81 |
"""
|
|
|
|
|
|
|
| 82 |
if is_ip_clause and symbolic > 0:
|
| 83 |
w_n, w_s = 0.60, 0.40
|
| 84 |
else:
|
|
|
|
| 103 |
"symbolic_score": round(symbolic, 3),
|
| 104 |
"weights": {"neural": w_n, "symbolic": w_s},
|
| 105 |
"raw_fused": round(raw, 3),
|
| 106 |
+
"floor_applied": False,
|
| 107 |
"final": score,
|
| 108 |
"formula": formula,
|
| 109 |
},
|
|
|
|
| 118 |
neural_loaded: bool = True,
|
| 119 |
) -> dict:
|
| 120 |
"""
|
| 121 |
+
Three-factor confidence calibrated for v5.4.
|
| 122 |
+
|
| 123 |
+
boundary_dist — distance from the nearest risk-level boundary (0.50, 0.80).
|
| 124 |
+
A score far from any boundary is a clear-cut decision.
|
| 125 |
+
agreement — alignment between neural and symbolic signals.
|
| 126 |
+
FIX v5.4: when symbolic is near-zero (no rules triggered),
|
| 127 |
+
we do NOT treat this as "disagreement". High neural + no
|
| 128 |
+
symbolic rules is a perfectly consistent, informative state.
|
| 129 |
+
Agreement is only penalised when BOTH scores are non-trivial
|
| 130 |
+
and they point in opposite directions.
|
| 131 |
+
rule_strength — more triggered rules → stronger deterministic evidence.
|
| 132 |
"""
|
| 133 |
+
# Factor 1: distance from nearest risk boundary
|
| 134 |
boundary_dist = min(abs(fused - RISK_LOW_MAX), abs(fused - RISK_MEDIUM_MAX))
|
| 135 |
dist_factor = min(boundary_dist / 0.20, 1.0)
|
| 136 |
|
| 137 |
+
# Factor 2: agreement (FIXED)
|
| 138 |
+
if not neural_loaded:
|
| 139 |
+
# No neural signal at all → moderate confidence
|
| 140 |
+
agree_factor = 0.50
|
| 141 |
+
elif symbolic <= _TRIVIAL_SCORE:
|
| 142 |
+
# Symbolic is near-zero: no rules fired. Neural is the only signal.
|
| 143 |
+
# This is NOT disagreement — treat as a confident neural-only verdict.
|
| 144 |
+
# Scale agreement by how decisive the neural score is:
|
| 145 |
+
# neural close to 0 or 1 → high confidence (0.80)
|
| 146 |
+
# neural near 0.50 (borderline) → lower confidence (0.50)
|
| 147 |
+
neural_decisiveness = abs(neural - 0.50) / 0.50 # 0 at boundary, 1 at extremes
|
| 148 |
+
agree_factor = 0.50 + 0.30 * neural_decisiveness # range [0.50, 0.80]
|
| 149 |
+
elif neural <= _TRIVIAL_SCORE:
|
| 150 |
+
# Neural is near-zero: symbolic rules fired but model disagrees.
|
| 151 |
+
# Genuine disagreement → low agreement factor.
|
| 152 |
+
agree_factor = 0.30
|
| 153 |
else:
|
| 154 |
+
# Both signals are non-trivial: measure actual divergence.
|
| 155 |
+
agree_factor = 1.0 - min(abs(neural - symbolic), 1.0)
|
| 156 |
|
| 157 |
+
# Factor 3: rule strength
|
| 158 |
if num_triggered == 0: rule_factor = 0.40
|
| 159 |
elif num_triggered == 1: rule_factor = 0.70
|
| 160 |
else: rule_factor = min(0.70 + 0.10 * (num_triggered - 1), 1.0)
|
local_interpreters.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
| 1 |
# local_interpreters.py
|
| 2 |
-
# v5.
|
| 3 |
-
#
|
| 4 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
import re
|
|
@@ -17,7 +23,7 @@ except Exception:
|
|
| 17 |
|
| 18 |
|
| 19 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 20 |
-
# Token filtering — display-time only
|
| 21 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
LIME_STOPWORDS = {
|
| 23 |
"a", "an", "the",
|
|
@@ -28,11 +34,10 @@ LIME_STOPWORDS = {
|
|
| 28 |
"be", "is", "are", "was", "were", "been", "being",
|
| 29 |
"have", "has", "had", "do", "does", "did",
|
| 30 |
"as", "if", "so", "than", "then",
|
| 31 |
-
"any", "all", "such", "no",
|
| 32 |
"i", "we", "you", "they", "he", "she",
|
| 33 |
}
|
| 34 |
|
| 35 |
-
# Legal modal / operative words β never filter these even if they look small
|
| 36 |
LIME_KEEP_LEGAL = {
|
| 37 |
"shall", "may", "must", "not", "only", "unless", "except", "without",
|
| 38 |
"subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
|
|
@@ -44,12 +49,10 @@ LIME_KEEP_LEGAL = {
|
|
| 44 |
"confidential", "disclose", "non-compete", "non-solicit",
|
| 45 |
}
|
| 46 |
|
| 47 |
-
# Roman numeral regex (used for filtering things like "IV", "xii")
|
| 48 |
_ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
|
| 49 |
|
| 50 |
|
| 51 |
def _clean_token(raw: str) -> str:
|
| 52 |
-
"""Strip leading/trailing punctuation; return lowercased core."""
|
| 53 |
return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
|
| 54 |
|
| 55 |
|
|
@@ -65,7 +68,6 @@ def _is_useful_lime_token(raw: str) -> bool:
|
|
| 65 |
return False
|
| 66 |
if _ROMAN_NUMERAL.fullmatch(core):
|
| 67 |
return False
|
| 68 |
-
# Need at least 2 alphanumeric chars to be a meaningful word
|
| 69 |
if sum(c.isalnum() for c in core) < 2:
|
| 70 |
return False
|
| 71 |
return True
|
|
@@ -75,7 +77,11 @@ def _is_useful_lime_token(raw: str) -> bool:
|
|
| 75 |
# LocalExplainer
|
| 76 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 77 |
class LocalExplainer:
|
| 78 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
self.num_samples = num_samples
|
| 80 |
self.timeout_seconds = timeout_seconds
|
| 81 |
self.last_error = ""
|
|
@@ -100,6 +106,9 @@ class LocalExplainer:
|
|
| 100 |
"""
|
| 101 |
Returns up to `display_count` filtered token contributions, sorted by
|
| 102 |
absolute weight. The model itself still sees the original full text.
|
|
|
|
|
|
|
|
|
|
| 103 |
"""
|
| 104 |
if not _LIME_AVAILABLE or self._lime is None:
|
| 105 |
return []
|
|
@@ -109,13 +118,16 @@ class LocalExplainer:
|
|
| 109 |
text_instance=text[:1500],
|
| 110 |
classifier_fn=predict_fn,
|
| 111 |
num_features=num_features,
|
| 112 |
-
num_samples=self.num_samples,
|
| 113 |
labels=(1,),
|
| 114 |
)
|
| 115 |
elapsed = time.time() - t0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
raw_pairs = exp.as_list(label=1)
|
| 117 |
|
| 118 |
-
# Filter for legal interpretability
|
| 119 |
filtered: list[tuple[str, float]] = [
|
| 120 |
(w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
|
| 121 |
]
|
|
|
|
| 1 |
# local_interpreters.py
|
| 2 |
+
# v5.4 — Performance optimisations:
|
| 3 |
+
# • Default LIME num_samples reduced from 50 → 25 (halves per-clause latency)
|
| 4 |
+
# • LocalExplainer.explain_with_lime() accepts a timeout and aborts early
|
| 5 |
+
# • get_attention_map() is unchanged (already gated by caller)
|
| 6 |
+
# • build_predict_fn_for_manager() unchanged
|
| 7 |
+
#
|
| 8 |
+
# IMPORTANT: app.py must ensure LIME and attention are called LAZILY
|
| 9 |
+
# (only when the user selects a clause in the explorer), NOT during bulk
|
| 10 |
+
# document analysis. This file only handles the explainer logic itself.
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
import re
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
# Token filtering — display-time only (unchanged from v5.3)
|
| 27 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
LIME_STOPWORDS = {
|
| 29 |
"a", "an", "the",
|
|
|
|
| 34 |
"be", "is", "are", "was", "were", "been", "being",
|
| 35 |
"have", "has", "had", "do", "does", "did",
|
| 36 |
"as", "if", "so", "than", "then",
|
| 37 |
+
"any", "all", "such", "no",
|
| 38 |
"i", "we", "you", "they", "he", "she",
|
| 39 |
}
|
| 40 |
|
|
|
|
| 41 |
LIME_KEEP_LEGAL = {
|
| 42 |
"shall", "may", "must", "not", "only", "unless", "except", "without",
|
| 43 |
"subject", "liable", "liability", "indemnify", "indemnity", "indemnification",
|
|
|
|
| 49 |
"confidential", "disclose", "non-compete", "non-solicit",
|
| 50 |
}
|
| 51 |
|
|
|
|
| 52 |
_ROMAN_NUMERAL = re.compile(r"^[ivxlcdm]+\.?$", re.IGNORECASE)
|
| 53 |
|
| 54 |
|
| 55 |
def _clean_token(raw: str) -> str:
|
|
|
|
| 56 |
return re.sub(r'^[^\w]+|[^\w]+$', '', raw).lower()
|
| 57 |
|
| 58 |
|
|
|
|
| 68 |
return False
|
| 69 |
if _ROMAN_NUMERAL.fullmatch(core):
|
| 70 |
return False
|
|
|
|
| 71 |
if sum(c.isalnum() for c in core) < 2:
|
| 72 |
return False
|
| 73 |
return True
|
|
|
|
| 77 |
# LocalExplainer
|
| 78 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 79 |
class LocalExplainer:
|
| 80 |
+
def __init__(
|
| 81 |
+
self,
|
| 82 |
+
num_samples: int = 25, # v5.4: reduced from 50 → 25
|
| 83 |
+
timeout_seconds: float = 25.0, # v5.4: tightened from 30 → 25 s
|
| 84 |
+
):
|
| 85 |
self.num_samples = num_samples
|
| 86 |
self.timeout_seconds = timeout_seconds
|
| 87 |
self.last_error = ""
|
|
|
|
| 106 |
"""
|
| 107 |
Returns up to `display_count` filtered token contributions, sorted by
|
| 108 |
absolute weight. The model itself still sees the original full text.
|
| 109 |
+
|
| 110 |
+
v5.4: Uses self.num_samples (default 25, down from 50).
|
| 111 |
+
Caller is responsible for invoking this only on-demand (lazy).
|
| 112 |
"""
|
| 113 |
if not _LIME_AVAILABLE or self._lime is None:
|
| 114 |
return []
|
|
|
|
| 118 |
text_instance=text[:1500],
|
| 119 |
classifier_fn=predict_fn,
|
| 120 |
num_features=num_features,
|
| 121 |
+
num_samples=self.num_samples, # v5.4: 25 by default
|
| 122 |
labels=(1,),
|
| 123 |
)
|
| 124 |
elapsed = time.time() - t0
|
| 125 |
+
|
| 126 |
+
if elapsed > self.timeout_seconds:
|
| 127 |
+
print(f"[WARN] LIME took {elapsed:.1f}s > timeout {self.timeout_seconds}s")
|
| 128 |
+
|
| 129 |
raw_pairs = exp.as_list(label=1)
|
| 130 |
|
|
|
|
| 131 |
filtered: list[tuple[str, float]] = [
|
| 132 |
(w, float(s)) for w, s in raw_pairs if _is_useful_lime_token(w)
|
| 133 |
]
|
pdf_utils.py
CHANGED
|
@@ -1,8 +1,12 @@
|
|
| 1 |
# pdf_utils.py
|
| 2 |
-
# v5.
|
| 3 |
-
#
|
| 4 |
-
# β’
|
| 5 |
-
# β’
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
import re
|
|
@@ -14,7 +18,177 @@ MIN_SUBCLAUSE_LEN = 60
|
|
| 14 |
|
| 15 |
|
| 16 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
def extract_text_from_pdf(file_path: str) -> str:
|
| 20 |
import fitz
|
|
@@ -22,15 +196,12 @@ def extract_text_from_pdf(file_path: str) -> str:
|
|
| 22 |
pages = [page.get_text("text") for page in doc]
|
| 23 |
doc.close()
|
| 24 |
raw = "\n".join(pages)
|
| 25 |
-
raw = re.sub(r'\
|
| 26 |
-
|
| 27 |
-
raw = re.sub(r'\n{3,}', '\n\n', raw)
|
| 28 |
-
raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)
|
| 29 |
-
return raw.strip()
|
| 30 |
|
| 31 |
|
| 32 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
-
# Header detection (primary segmentation)
|
| 34 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
|
| 36 |
("decimal", re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
|
|
@@ -42,7 +213,6 @@ _HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
|
|
| 42 |
("caps", re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
|
| 43 |
]
|
| 44 |
|
| 45 |
-
# Inline subclause markers — used in the SECOND pass (mid-text, not line-start)
|
| 46 |
_INLINE_SUBCLAUSE = re.compile(
|
| 47 |
r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
|
| 48 |
re.IGNORECASE,
|
|
@@ -55,7 +225,6 @@ def _collect_headers(text: str) -> list[tuple[int, str, str]]:
|
|
| 55 |
for m in pat.finditer(text):
|
| 56 |
hits.append((m.start(1), m.group(1).strip(), kind))
|
| 57 |
hits.sort(key=lambda h: h[0])
|
| 58 |
-
|
| 59 |
deduped: list[tuple[int, str, str]] = []
|
| 60 |
for h in hits:
|
| 61 |
if not deduped or abs(h[0] - deduped[-1][0]) > 2:
|
|
@@ -64,25 +233,17 @@ def _collect_headers(text: str) -> list[tuple[int, str, str]]:
|
|
| 64 |
|
| 65 |
|
| 66 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
-
# Inline subclause
|
| 68 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 69 |
def _split_inline_subclauses(
|
| 70 |
body: str,
|
| 71 |
parent_number: str | None = None,
|
| 72 |
min_length: int = MIN_SUBCLAUSE_LEN,
|
| 73 |
) -> list[dict]:
|
| 74 |
-
"""
|
| 75 |
-
If the clause body contains ≥ 2 inline subclause markers, split it.
|
| 76 |
-
Subclause numbers are prefixed with the parent (e.g. "5.7" + "(a)" → "5.7(a)").
|
| 77 |
-
Returns [] if no useful split is possible (caller keeps the original).
|
| 78 |
-
"""
|
| 79 |
matches = list(_INLINE_SUBCLAUSE.finditer(body))
|
| 80 |
if len(matches) < 2:
|
| 81 |
return []
|
| 82 |
-
|
| 83 |
parts: list[dict] = []
|
| 84 |
-
|
| 85 |
-
# First chunk: text before the first marker (usually the parent header line)
|
| 86 |
head = body[:matches[0].start()].strip()
|
| 87 |
if head and len(head) >= 30:
|
| 88 |
parts.append({
|
|
@@ -90,17 +251,14 @@ def _split_inline_subclauses(
|
|
| 90 |
"number": parent_number,
|
| 91 |
"kind": "decimal" if parent_number else "paragraph",
|
| 92 |
})
|
| 93 |
-
|
| 94 |
for i, m in enumerate(matches):
|
| 95 |
start = m.start()
|
| 96 |
end = matches[i + 1].start() if i + 1 < len(matches) else len(body)
|
| 97 |
chunk = body[start:end].strip()
|
| 98 |
if len(chunk) < min_length:
|
| 99 |
-
# Too short to be a real subclause — fold into previous
|
| 100 |
if parts:
|
| 101 |
parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
|
| 102 |
continue
|
| 103 |
-
|
| 104 |
sub_marker = m.group(1).strip()
|
| 105 |
composite = f"{parent_number}{sub_marker}" if parent_number else sub_marker
|
| 106 |
parts.append({
|
|
@@ -108,12 +266,10 @@ def _split_inline_subclauses(
|
|
| 108 |
"number": composite,
|
| 109 |
"kind": "subclause",
|
| 110 |
})
|
| 111 |
-
|
| 112 |
return parts
|
| 113 |
|
| 114 |
|
| 115 |
def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
|
| 116 |
-
"""If still too long, split on sentence boundaries to bound LM input."""
|
| 117 |
body = clause["text"]
|
| 118 |
if len(body) <= max_len:
|
| 119 |
return [clause]
|
|
@@ -127,7 +283,6 @@ def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]
|
|
| 127 |
current = (current + " " + s).strip() if current else s
|
| 128 |
if current:
|
| 129 |
chunks.append(current.strip())
|
| 130 |
-
|
| 131 |
return [
|
| 132 |
{
|
| 133 |
"text": c,
|
|
@@ -145,9 +300,13 @@ def split_into_clauses_with_metadata(
|
|
| 145 |
text: str,
|
| 146 |
min_length: int = 40,
|
| 147 |
) -> list[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
headers = _collect_headers(text)
|
| 149 |
|
| 150 |
-
# ββ Primary segmentation (heading-based) ββββββββββββββββββββββββββββββ
|
| 151 |
primary: list[dict] = []
|
| 152 |
if headers:
|
| 153 |
for i, (start, marker, kind) in enumerate(headers):
|
|
@@ -162,6 +321,14 @@ def split_into_clauses_with_metadata(
|
|
| 162 |
if len(p) >= min_length:
|
| 163 |
primary.append({"text": p, "number": None, "kind": "paragraph"})
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
# ββ Secondary pass: inline subclause splitting for long clauses ββββββββ
|
| 166 |
refined: list[dict] = []
|
| 167 |
for clause in primary:
|
|
@@ -175,12 +342,19 @@ def split_into_clauses_with_metadata(
|
|
| 175 |
continue
|
| 176 |
refined.append(clause)
|
| 177 |
|
| 178 |
-
# ββ Tertiary pass: hard length cap
|
| 179 |
final: list[dict] = []
|
| 180 |
for clause in refined:
|
| 181 |
final.extend(_hard_cap_split(clause))
|
| 182 |
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
|
| 186 |
def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
|
|
|
|
| 1 |
# pdf_utils.py
|
| 2 |
+
# v5.4 — Robust PDF preprocessing: TOC removal, garbage filtering, finer segmentation.
|
| 3 |
+
# Changes vs v5.3:
|
| 4 |
+
# • extract_text_from_pdf() now calls clean_raw_pdf_text() after extraction
|
| 5 |
+
# • clean_raw_pdf_text() strips page numbers, separator lines, OCR noise,
|
| 6 |
+
# repeated doc titles, running headers/footers
|
| 7 |
+
# • is_toc_block() heuristic detects and rejects Table of Contents chunks
|
| 8 |
+
# • is_garbage_clause() rejects structurally empty / metadata-only chunks
|
| 9 |
+
# • split_into_clauses_with_metadata() integrates both filters before returning
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
import re
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
# Step 1 — Raw text cleaning (runs immediately after PyMuPDF extraction)
|
| 22 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 23 |
+
|
| 24 |
+
# Standalone page number line: e.g. "19", "- 3 -", "Page 4", "PAGE 4 OF 12"
|
| 25 |
+
_PAGE_NUM_LINE = re.compile(
|
| 26 |
+
r'(?m)^[ \t]*(?:[-ββ]*\s*)?(?:page\s+)?\d{1,4}(?:\s+of\s+\d{1,4})?'
|
| 27 |
+
r'(?:\s*[-ββ]*)?[ \t]*$',
|
| 28 |
+
re.IGNORECASE,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
# Roman-numeral-only lines (TOC page markers: i, ii, iii, iv, v, …)
|
| 32 |
+
_ROMAN_PAGE_LINE = re.compile(
|
| 33 |
+
r'(?m)^[ \t]*[ivxlcdmIVXLCDM]{1,6}[ \t]*$'
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# Horizontal separator lines: "___", "---", "===", "* * *", etc.
|
| 37 |
+
_SEPARATOR_LINE = re.compile(
|
| 38 |
+
r'(?m)^[ \t]*[-=_*Β·β’]{3,}[ \t]*$'
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
# Running header/footer patterns that repeat every page
|
| 42 |
+
# e.g. "AGREEMENT AND PLAN OF MERGER", "CONFIDENTIAL", "EXECUTION VERSION"
|
| 43 |
+
_RUNNING_HEADER = re.compile(
|
| 44 |
+
r'(?m)^[ \t]*(AGREEMENT AND PLAN OF|EXECUTION COPY|EXECUTION VERSION|'
|
| 45 |
+
r'CONFIDENTIAL|DRAFT|PRIVILEGED AND CONFIDENTIAL|'
|
| 46 |
+
r'EXHIBIT [A-Z]|SCHEDULE [A-Z\d])[^\n]*$',
|
| 47 |
+
re.IGNORECASE,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
# TOC "dot-leader" lines: "Section 7.04 ............ 43"
|
| 51 |
+
_TOC_DOT_LEADER = re.compile(
|
| 52 |
+
r'(?m)^[^\n]{5,80}[.\s]{4,}\s*\d{1,4}\s*$'
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def clean_raw_pdf_text(raw: str) -> str:
|
| 57 |
+
"""
|
| 58 |
+
Post-extraction cleaning: remove artefacts that corrupt clause segmentation.
|
| 59 |
+
The goal is NOT to remove legal content — only structural/metadata noise.
|
| 60 |
+
"""
|
| 61 |
+
text = raw
|
| 62 |
+
|
| 63 |
+
# 1. Normalize line endings and excessive whitespace
|
| 64 |
+
text = re.sub(r'\r\n', '\n', text)
|
| 65 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 66 |
+
text = re.sub(r'\n{4,}', '\n\n\n', text)
|
| 67 |
+
|
| 68 |
+
# 2. Remove TOC dot-leader lines BEFORE other cleanup (greedy match)
|
| 69 |
+
text = _TOC_DOT_LEADER.sub('', text)
|
| 70 |
+
|
| 71 |
+
# 3. Running headers / footers
|
| 72 |
+
text = _RUNNING_HEADER.sub('', text)
|
| 73 |
+
|
| 74 |
+
# 4. Standalone page numbers and roman numerals
|
| 75 |
+
text = _PAGE_NUM_LINE.sub('', text)
|
| 76 |
+
text = _ROMAN_PAGE_LINE.sub('', text)
|
| 77 |
+
|
| 78 |
+
# 5. Separator lines
|
| 79 |
+
text = _SEPARATOR_LINE.sub('', text)
|
| 80 |
+
|
| 81 |
+
# 6. "TABLE OF CONTENTS" heading itself (we will also filter the block below)
|
| 82 |
+
text = re.sub(
|
| 83 |
+
r'(?m)^[ \t]*TABLE\s+OF\s+CONTENTS[ \t]*$', '', text, flags=re.IGNORECASE
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# 7. Collapse runs of blank lines left by removals
|
| 87 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 88 |
+
|
| 89 |
+
return text.strip()
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 93 |
+
# Step 2 — TOC block detection (per-clause heuristic)
|
| 94 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
+
|
| 96 |
+
# How many "Section X.XX" style references in a block makes it look like a TOC
|
| 97 |
+
_TOC_SECTION_REF = re.compile(
|
| 98 |
+
r'(?:Section|ARTICLE|Article|SCHEDULE|Annex|Exhibit)\s+[\dIVXA-Z]',
|
| 99 |
+
re.IGNORECASE,
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
# A line that is ONLY a heading / short label (no sentence verb)
|
| 103 |
+
_HEADING_ONLY_LINE = re.compile(
|
| 104 |
+
r'(?m)^[ \t]*[A-Z][A-Za-z0-9 &/\-]{2,50}[ \t]*$'
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def is_toc_block(text: str) -> bool:
|
| 109 |
+
"""
|
| 110 |
+
Return True if this chunk looks like a Table of Contents entry or
|
| 111 |
+
a run of section listings that are not real legal prose.
|
| 112 |
+
|
| 113 |
+
Heuristics (any one is sufficient to flag):
|
| 114 |
+
A. ≥ 4 "Section X.XX / ARTICLE X" references with very few full sentences
|
| 115 |
+
B. Word count < 80 and section-reference count ≥ 3
|
| 116 |
+
C. Heading-only lines are > 60% of non-empty lines (requires ≥ 4 lines)
|
| 117 |
+
"""
|
| 118 |
+
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 119 |
+
total_lines = len(lines)
|
| 120 |
+
if total_lines == 0:
|
| 121 |
+
return True # empty → garbage
|
| 122 |
+
|
| 123 |
+
section_refs = len(_TOC_SECTION_REF.findall(text))
|
| 124 |
+
# Count lines that contain at least one verb-like word (rough sentence proxy)
|
| 125 |
+
sentence_lines = sum(
|
| 126 |
+
1 for l in lines
|
| 127 |
+
if re.search(r'\b(shall|will|may|must|agree|provide|require|include|'
|
| 128 |
+
r'warrant|represent|indemnif|terminat|govern|licens|assign|'
|
| 129 |
+
r'disclose|notify|maintain|ensure|permit|restrict)\b', l, re.I)
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
word_count = len(text.split())
|
| 133 |
+
|
| 134 |
+
# Heuristic A: many section refs, almost no substantive sentences
|
| 135 |
+
if section_refs >= 4 and sentence_lines <= max(1, total_lines * 0.15):
|
| 136 |
+
return True
|
| 137 |
+
|
| 138 |
+
# Heuristic B: very short and many section refs (classic TOC listing)
|
| 139 |
+
if word_count < 80 and section_refs >= 3:
|
| 140 |
+
return True
|
| 141 |
+
|
| 142 |
+
# Heuristic C: heading-only lines dominate
|
| 143 |
+
heading_lines = sum(1 for l in lines if _HEADING_ONLY_LINE.fullmatch(l))
|
| 144 |
+
if total_lines >= 4 and heading_lines / total_lines > 0.60:
|
| 145 |
+
return True
|
| 146 |
+
|
| 147 |
+
return False
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 151 |
+
# Step 3 — Garbage clause filter (pre-inference gate)
|
| 152 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 153 |
+
|
| 154 |
+
def is_garbage_clause(text: str, min_words: int = 15) -> bool:
|
| 155 |
+
"""
|
| 156 |
+
Return True for chunks that should never reach the neural model:
|
| 157 |
+
• Too short to be a real clause
|
| 158 |
+
• Mostly digits / page references
|
| 159 |
+
• Mostly isolated section labels with no prose
|
| 160 |
+
• All-caps title-only blocks
|
| 161 |
+
"""
|
| 162 |
+
words = text.split()
|
| 163 |
+
if len(words) < min_words:
|
| 164 |
+
return True
|
| 165 |
+
|
| 166 |
+
# Too many digit tokens (page-number contamination)
|
| 167 |
+
digit_ratio = sum(1 for w in words if w.strip('.,;:()').isdigit()) / len(words)
|
| 168 |
+
if digit_ratio > 0.35:
|
| 169 |
+
return True
|
| 170 |
+
|
| 171 |
+
# Too many "Section" / "Article" tokens relative to word count
|
| 172 |
+
struct_tokens = len(re.findall(
|
| 173 |
+
r'\b(?:Section|ARTICLE|Article|Exhibit|Schedule|Annex|Appendix|Part|Chapter)\b',
|
| 174 |
+
text, re.IGNORECASE,
|
| 175 |
+
))
|
| 176 |
+
if struct_tokens / len(words) > 0.25:
|
| 177 |
+
return True
|
| 178 |
+
|
| 179 |
+
# No alphabetic word longer than 3 chars → pure noise / numbering block
|
| 180 |
+
if not any(len(w) > 3 and w.isalpha() for w in words):
|
| 181 |
+
return True
|
| 182 |
+
|
| 183 |
+
# Delegate to TOC detector
|
| 184 |
+
if is_toc_block(text):
|
| 185 |
+
return True
|
| 186 |
+
|
| 187 |
+
return False
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 191 |
+
# PDF extraction (wraps clean step)
|
| 192 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 193 |
def extract_text_from_pdf(file_path: str) -> str:
|
| 194 |
import fitz
|
|
|
|
| 196 |
pages = [page.get_text("text") for page in doc]
|
| 197 |
doc.close()
|
| 198 |
raw = "\n".join(pages)
|
| 199 |
+
raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw) # de-hyphenate before cleaning
|
| 200 |
+
return clean_raw_pdf_text(raw)
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
|
| 203 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 204 |
+
# Header detection (primary segmentation) — unchanged from v5.3
|
| 205 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 206 |
_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
|
| 207 |
("decimal", re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
|
|
|
|
| 213 |
("caps", re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
|
| 214 |
]
|
| 215 |
|
|
|
|
| 216 |
_INLINE_SUBCLAUSE = re.compile(
|
| 217 |
r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
|
| 218 |
re.IGNORECASE,
|
|
|
|
| 225 |
for m in pat.finditer(text):
|
| 226 |
hits.append((m.start(1), m.group(1).strip(), kind))
|
| 227 |
hits.sort(key=lambda h: h[0])
|
|
|
|
| 228 |
deduped: list[tuple[int, str, str]] = []
|
| 229 |
for h in hits:
|
| 230 |
if not deduped or abs(h[0] - deduped[-1][0]) > 2:
|
|
|
|
| 233 |
|
| 234 |
|
| 235 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 236 |
+
# Inline subclause splitting — unchanged from v5.3
|
| 237 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 238 |
def _split_inline_subclauses(
|
| 239 |
body: str,
|
| 240 |
parent_number: str | None = None,
|
| 241 |
min_length: int = MIN_SUBCLAUSE_LEN,
|
| 242 |
) -> list[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
matches = list(_INLINE_SUBCLAUSE.finditer(body))
|
| 244 |
if len(matches) < 2:
|
| 245 |
return []
|
|
|
|
| 246 |
parts: list[dict] = []
|
|
|
|
|
|
|
| 247 |
head = body[:matches[0].start()].strip()
|
| 248 |
if head and len(head) >= 30:
|
| 249 |
parts.append({
|
|
|
|
| 251 |
"number": parent_number,
|
| 252 |
"kind": "decimal" if parent_number else "paragraph",
|
| 253 |
})
|
|
|
|
| 254 |
for i, m in enumerate(matches):
|
| 255 |
start = m.start()
|
| 256 |
end = matches[i + 1].start() if i + 1 < len(matches) else len(body)
|
| 257 |
chunk = body[start:end].strip()
|
| 258 |
if len(chunk) < min_length:
|
|
|
|
| 259 |
if parts:
|
| 260 |
parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
|
| 261 |
continue
|
|
|
|
| 262 |
sub_marker = m.group(1).strip()
|
| 263 |
composite = f"{parent_number}{sub_marker}" if parent_number else sub_marker
|
| 264 |
parts.append({
|
|
|
|
| 266 |
"number": composite,
|
| 267 |
"kind": "subclause",
|
| 268 |
})
|
|
|
|
| 269 |
return parts
|
| 270 |
|
| 271 |
|
| 272 |
def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
|
|
|
|
| 273 |
body = clause["text"]
|
| 274 |
if len(body) <= max_len:
|
| 275 |
return [clause]
|
|
|
|
| 283 |
current = (current + " " + s).strip() if current else s
|
| 284 |
if current:
|
| 285 |
chunks.append(current.strip())
|
|
|
|
| 286 |
return [
|
| 287 |
{
|
| 288 |
"text": c,
|
|
|
|
| 300 |
text: str,
|
| 301 |
min_length: int = 40,
|
| 302 |
) -> list[dict]:
|
| 303 |
+
"""
|
| 304 |
+
Segment text into clauses, filter TOC/garbage, return clean list.
|
| 305 |
+
This is the single entry-point used by analyze_document().
|
| 306 |
+
"""
|
| 307 |
headers = _collect_headers(text)
|
| 308 |
|
| 309 |
+
# ββ Primary segmentation (heading-based) ββββββββββββββββββββββββββββββ
|
| 310 |
primary: list[dict] = []
|
| 311 |
if headers:
|
| 312 |
for i, (start, marker, kind) in enumerate(headers):
|
|
|
|
| 321 |
if len(p) >= min_length:
|
| 322 |
primary.append({"text": p, "number": None, "kind": "paragraph"})
|
| 323 |
|
| 324 |
+
# ββ TOC / garbage filter (NEW in v5.4) ββββββββββββββββββββββββββββββββ
|
| 325 |
+
primary = [c for c in primary if not is_garbage_clause(c["text"])]
|
| 326 |
+
|
| 327 |
+
if not primary:
|
| 328 |
+
# If everything was filtered, fall back to treating the full text as one
|
| 329 |
+
# clause rather than returning an empty list (caller handles it).
|
| 330 |
+
return [{"text": text[:2000], "number": None, "kind": "paragraph"}]
|
| 331 |
+
|
| 332 |
# ββ Secondary pass: inline subclause splitting for long clauses ββββββββ
|
| 333 |
refined: list[dict] = []
|
| 334 |
for clause in primary:
|
|
|
|
| 342 |
continue
|
| 343 |
refined.append(clause)
|
| 344 |
|
| 345 |
+
# ββ Tertiary pass: hard length cap ββββββββββββββββββββββββββββββββββββ
|
| 346 |
final: list[dict] = []
|
| 347 |
for clause in refined:
|
| 348 |
final.extend(_hard_cap_split(clause))
|
| 349 |
|
| 350 |
+
# ββ Final garbage sweep after splitting βββββββββββββββββββββββββββββββ
|
| 351 |
+
# Splitting can produce tiny chunks β filter them out too.
|
| 352 |
+
final = [c for c in final if not is_garbage_clause(c["text"])]
|
| 353 |
+
|
| 354 |
+
print(f"[INFO] Segmentation: {len(primary)} primary β "
|
| 355 |
+
f"{len(refined)} refined β {len(final)} final clean clauses")
|
| 356 |
+
|
| 357 |
+
return final if final else [{"text": text[:2000], "number": None, "kind": "paragraph"}]
|
| 358 |
|
| 359 |
|
| 360 |
def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
|