File size: 16,195 Bytes
28db5b3
6893de4
 
 
 
 
 
 
 
28db5b3
 
 
 
 
af910e9
 
 
92625e7
 
af910e9
6893de4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af910e9
 
 
 
 
28db5b3
 
6893de4
 
28db5b3
 
af910e9
6893de4
af910e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6893de4
af910e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92625e7
af910e9
 
 
92625e7
af910e9
92625e7
 
28db5b3
af910e9
 
 
 
 
 
 
6893de4
 
 
 
af910e9
 
6893de4
af910e9
 
 
 
 
 
 
 
 
 
 
 
 
 
6893de4
 
 
 
 
 
 
 
af910e9
 
 
 
 
 
 
92625e7
af910e9
 
 
 
92625e7
6893de4
af910e9
 
 
92625e7
6893de4
 
 
 
 
 
 
 
92625e7
 
af910e9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# pdf_utils.py
# v5.4 — Robust PDF preprocessing: TOC removal, garbage filtering, finer segmentation.
# Changes vs v5.3:
#   • extract_text_from_pdf() now calls clean_raw_pdf_text() after extraction
#   • clean_raw_pdf_text() strips page numbers, separator lines, OCR noise,
#     repeated doc titles, running headers/footers
#   • is_toc_block() heuristic detects and rejects Table of Contents chunks
#   • is_garbage_clause() rejects structurally empty / metadata-only chunks
#   • split_into_clauses_with_metadata() integrates both filters before returning

from __future__ import annotations
import re


# Clauses longer than this many characters are passed to the inline
# "(a) ... (b) ..." subclause splitter in split_into_clauses_with_metadata().
LONG_CLAUSE_CHARS = 1200
# Default maximum clause length (characters) used by _hard_cap_split() when it
# breaks an over-long clause at sentence boundaries.
MAX_CLAUSE_CHARS  = 3000
# Default minimum length (characters) for a subclause to be emitted on its own
# by _split_inline_subclauses(); also the minimum chunk size kept by
# _hard_cap_split().
MIN_SUBCLAUSE_LEN = 60


# ─────────────────────────────────────────────────────────────────────────────
# Step 1 — Raw text cleaning (runs immediately after PyMuPDF extraction)
# ─────────────────────────────────────────────────────────────────────────────

# Standalone page number line: e.g. "19", "- 3 -", "Page 4", "PAGE 4 OF 12"
_PAGE_NUM_LINE = re.compile(
    r'(?m)^[ \t]*(?:[-–—]*\s*)?(?:page\s+)?\d{1,4}(?:\s+of\s+\d{1,4})?'
    r'(?:\s*[-–—]*)?[ \t]*$',
    re.IGNORECASE,
)

# Roman-numeral-only lines (TOC page markers: i, ii, iii, iv, v, …)
_ROMAN_PAGE_LINE = re.compile(
    r'(?m)^[ \t]*[ivxlcdmIVXLCDM]{1,6}[ \t]*$'
)

# Horizontal separator lines: "___", "---", "===", "* * *", etc.
_SEPARATOR_LINE = re.compile(
    r'(?m)^[ \t]*[-=_*Β·β€’]{3,}[ \t]*$'
)

# Running header/footer patterns that repeat every page
# e.g. "AGREEMENT AND PLAN OF MERGER", "CONFIDENTIAL", "EXECUTION VERSION"
_RUNNING_HEADER = re.compile(
    r'(?m)^[ \t]*(AGREEMENT AND PLAN OF|EXECUTION COPY|EXECUTION VERSION|'
    r'CONFIDENTIAL|DRAFT|PRIVILEGED AND CONFIDENTIAL|'
    r'EXHIBIT [A-Z]|SCHEDULE [A-Z\d])[^\n]*$',
    re.IGNORECASE,
)

# TOC "dot-leader" lines: "Section 7.04 ............ 43"
_TOC_DOT_LEADER = re.compile(
    r'(?m)^[^\n]{5,80}[.\s]{4,}\s*\d{1,4}\s*$'
)


def clean_raw_pdf_text(raw: str) -> str:
    """Strip structural noise from freshly extracted PDF text.

    Removes artefacts that corrupt clause segmentation: TOC dot-leader lines,
    running headers/footers, standalone page numbers (arabic and roman),
    separator rules and the "TABLE OF CONTENTS" heading. The goal is NOT to
    remove legal content, only structural/metadata noise.

    Args:
        raw: Text as produced by the PDF extractor.

    Returns:
        The cleaned text with blank-line runs collapsed to a single empty
        line and leading/trailing whitespace stripped. Empty or missing
        input yields "".
    """
    if not raw:
        return ""

    # 1. Normalise line endings and excessive whitespace. Lone "\r" is handled
    #    too, because the (?m)^...$ patterns below only see "\n" boundaries.
    text = re.sub(r'\r\n?', '\n', raw)
    text = re.sub(r'[ \t]+', ' ', text)
    # Drop the (single, after the collapse above) trailing blank on each line
    # so whitespace-only lines become truly empty and blank runs can collapse.
    text = re.sub(r' \n', '\n', text)
    text = re.sub(r'\n{4,}', '\n\n\n', text)

    # 2. Remove TOC dot-leader lines BEFORE other cleanup, while the page
    #    number at the end of each entry is still attached to its line.
    text = _TOC_DOT_LEADER.sub('', text)

    # 3. Running headers / footers
    text = _RUNNING_HEADER.sub('', text)

    # 4. Standalone page numbers and roman numerals
    text = _PAGE_NUM_LINE.sub('', text)
    text = _ROMAN_PAGE_LINE.sub('', text)

    # 5. Separator lines
    text = _SEPARATOR_LINE.sub('', text)

    # 6. The "TABLE OF CONTENTS" heading itself (the TOC body is rejected
    #    later by the per-clause filters)
    text = re.sub(
        r'(?m)^[ \t]*TABLE\s+OF\s+CONTENTS[ \t]*$', '', text, flags=re.IGNORECASE
    )

    # 7. Collapse runs of blank lines left by the removals
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


# ─────────────────────────────────────────────────────────────────────────────
# Step 2 — TOC block detection (per-clause heuristic)
# ─────────────────────────────────────────────────────────────────────────────

# How many "Section X.XX" style references in a block makes it look like a TOC
_TOC_SECTION_REF = re.compile(
    r'(?:Section|ARTICLE|Article|SCHEDULE|Annex|Exhibit)\s+[\dIVXA-Z]',
    re.IGNORECASE,
)

# A line that is ONLY a heading / short label (no sentence verb)
_HEADING_ONLY_LINE = re.compile(
    r'(?m)^[ \t]*[A-Z][A-Za-z0-9 &/\-]{2,50}[ \t]*$'
)


# Words that signal real contractual prose. The modals are matched as whole
# words; the remaining entries are stems followed by \w*, so "indemnif" also
# covers "indemnify" / "indemnification", "terminat" covers "terminate" /
# "termination", and so on.
_SENTENCE_VERB = re.compile(
    r'\b(?:(?:shall|will|may|must)\b'
    r'|(?:agree|provid|requir|includ|warrant|represent|indemnif|terminat|'
    r'govern|licens|assign|disclos|notif|maintain|ensur|permit|restrict)\w*)',
    re.IGNORECASE,
)


def is_toc_block(text: str) -> bool:
    """Return True if the chunk looks like a Table of Contents listing.

    A chunk with no non-blank lines is also reported as True (nothing worth
    keeping). Otherwise any one of these heuristics is sufficient:

      A. >= 4 section/article references while the number of lines containing
         a verb-like word is at most max(1, 15% of the non-empty lines).
      B. Fewer than 80 words but >= 3 section/article references.
      C. At least 4 non-empty lines, more than 60% of them heading-only.

    Args:
        text: Candidate clause text.

    Returns:
        True when the chunk should be treated as TOC / non-prose.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    if not lines:
        return True  # empty -> garbage
    total_lines = len(lines)

    section_refs = len(_TOC_SECTION_REF.findall(text))
    # Lines containing at least one verb-like word (rough sentence proxy)
    sentence_lines = sum(1 for ln in lines if _SENTENCE_VERB.search(ln))
    word_count = len(text.split())

    # Heuristic A: many section refs, almost no substantive sentences
    if section_refs >= 4 and sentence_lines <= max(1, total_lines * 0.15):
        return True

    # Heuristic B: very short and many section refs (classic TOC listing)
    if word_count < 80 and section_refs >= 3:
        return True

    # Heuristic C: heading-only lines dominate
    heading_lines = sum(1 for ln in lines if _HEADING_ONLY_LINE.fullmatch(ln))
    if total_lines >= 4 and heading_lines / total_lines > 0.60:
        return True

    return False


# ─────────────────────────────────────────────────────────────────────────────
# Step 3 — Garbage clause filter (pre-inference gate)
# ─────────────────────────────────────────────────────────────────────────────

def is_garbage_clause(text: str, min_words: int = 15) -> bool:
    """Return True for chunks that should never reach the neural model.

    A chunk is rejected when it is:
      - empty, or shorter than ``min_words`` whitespace-separated words;
      - more than 35% digit-only tokens (page-number contamination);
      - more than 25% structural keywords (Section, Article, Exhibit, ...);
      - lacking any purely alphabetic word longer than 3 characters;
      - classified as a table-of-contents block by is_toc_block().

    Args:
        text: Candidate clause text.
        min_words: Minimum number of words for a chunk to be considered.

    Returns:
        True if the chunk is noise, False if it looks like a real clause.
    """
    words = text.split()
    # The explicit emptiness check keeps the ratios below from dividing by
    # zero when a caller passes min_words <= 0.
    if not words or len(words) < min_words:
        return True

    # Too many digit tokens (page-number contamination)
    digit_ratio = sum(1 for w in words if w.strip('.,;:()').isdigit()) / len(words)
    if digit_ratio > 0.35:
        return True

    # Too many "Section" / "Article" tokens relative to word count
    struct_tokens = len(re.findall(
        r'\b(?:Section|ARTICLE|Article|Exhibit|Schedule|Annex|Appendix|Part|Chapter)\b',
        text, re.IGNORECASE,
    ))
    if struct_tokens / len(words) > 0.25:
        return True

    # No alphabetic word longer than 3 chars -> pure noise / numbering block
    if not any(len(w) > 3 and w.isalpha() for w in words):
        return True

    # Delegate to the TOC detector
    if is_toc_block(text):
        return True

    return False


# ─────────────────────────────────────────────────────────────────────────────
# PDF extraction (wraps clean step)
# ─────────────────────────────────────────────────────────────────────────────
def extract_text_from_pdf(file_path: str) -> str:
    """Extract the plain text of a PDF and run the cleaning pass over it.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages joined with newlines, de-hyphenated across
        line breaks and passed through clean_raw_pdf_text().
    """
    # Imported inside the function, so importing this module does not itself
    # require the PDF library.
    import fitz

    doc = fitz.open(file_path)
    try:
        pages = [page.get_text("text") for page in doc]
    finally:
        # Release the file handle even if text extraction fails part-way.
        doc.close()

    raw = "\n".join(pages)
    raw = re.sub(r'(\w)-\n(\w)', r'\1\2', raw)   # de-hyphenate before cleaning
    return clean_raw_pdf_text(raw)


# ─────────────────────────────────────────────────────────────────────────────
# Header detection (primary segmentation) — unchanged from v5.3
# ─────────────────────────────────────────────────────────────────────────────
# (kind, pattern) pairs tried by _collect_headers(). Every pattern is
# line-anchored via (?m)^ and captures the header marker in group 1. List
# order matters only as a tie-break: _collect_headers() uses a stable sort by
# position, so for hits at the same offset the earlier entry is kept.
_HEADER_PATTERNS: list[tuple[str, re.Pattern]] = [
    # "1", "2.3", "4.1.2.1." -- a number with up to three dotted sub-levels and
    # an optional trailing dot, followed by whitespace and more text.
    ("decimal",  re.compile(r'(?m)^\s*(\d+(?:\.\d+){0,3}\.?)\s+(?=\S)')),
    # Keyword + number ("Section 7.04") or keyword + roman numeral
    # ("ARTICLE IV"), followed by whitespace, ".", "-" or ":"; case-insensitive.
    ("article",  re.compile(
        r'(?m)^\s*((?:Article|Section|Clause|Schedule|Annexure|Annex|Appendix|Part|Chapter)'
        r'\s+(?:\d+(?:\.\d+){0,2}|[IVXLC]+))[\s\.\-:]', re.IGNORECASE)),
    # Parenthesised 1-4 letter marker at line start: "(a)", "(iv)", "(AA)".
    ("lettered", re.compile(r'(?m)^\s*(\(\s*[a-zA-Z]{1,4}\s*\))\s+(?=\S)')),
    # Upper-case roman numeral followed by a dot: "IV. ".
    ("roman",    re.compile(r'(?m)^\s*([IVX]{1,5}\.)\s+(?=\S)')),
    # A whole line of 5-60 upper-case letters, digits, spaces, "&", "/" or "-".
    ("caps",     re.compile(r'(?m)^([A-Z][A-Z0-9 &/\-]{4,59})\s*$')),
]

# Inline "(a)" / "(iv)" style marker inside running text: it must be preceded
# by whitespace or one of ". ; :" and followed by whitespace and a word
# character. Case-insensitive, so "(A)" qualifies too. Group 1 is the marker.
_INLINE_SUBCLAUSE = re.compile(
    r'(?<=[\s\.\;\:])(\(\s*(?:[a-z]|[ivx]{1,4})\s*\))\s+(?=[A-Z\w])',
    re.IGNORECASE,
)


def _collect_headers(text: str) -> list[tuple[int, str, str]]:
    """Locate every header marker in ``text``.

    Each pattern in _HEADER_PATTERNS is run over the text; hits are ordered
    by the offset of their captured marker, and a hit that starts within two
    characters of the previously kept one is discarded as a duplicate.

    Returns:
        A list of ``(offset, marker, kind)`` tuples in ascending offset order.
    """
    candidates = [
        (match.start(1), match.group(1).strip(), kind)
        for kind, pattern in _HEADER_PATTERNS
        for match in pattern.finditer(text)
    ]
    # list.sort is stable, so equal offsets keep their pattern-table order.
    candidates.sort(key=lambda item: item[0])

    kept: list[tuple[int, str, str]] = []
    for candidate in candidates:
        if kept and abs(candidate[0] - kept[-1][0]) <= 2:
            continue  # same spot already claimed by an earlier hit
        kept.append(candidate)
    return kept


# ─────────────────────────────────────────────────────────────────────────────
# Inline subclause splitting — unchanged from v5.3
# ─────────────────────────────────────────────────────────────────────────────
def _split_inline_subclauses(
    body: str,
    parent_number: str | None = None,
    min_length: int = MIN_SUBCLAUSE_LEN,
) -> list[dict]:
    """Split a clause at inline "(a) ... (b) ..." markers.

    Args:
        body: Full text of the clause.
        parent_number: Marker of the enclosing clause, prefixed to each
            subclause marker (e.g. "2.1" + "(a)" -> "2.1(a)"). May be None.
        min_length: Chunks shorter than this are not emitted on their own.

    Returns:
        A list of ``{"text", "number", "kind"}`` dicts, or ``[]`` when fewer
        than two markers are found (the caller then keeps the clause whole).
        Lead-in text before the first marker becomes its own part when it is
        at least 30 characters long. A too-short chunk is appended to the
        preceding part; if there is no preceding part yet it is carried
        forward and prepended to the next chunk instead of being discarded.
        If nothing ever reaches ``min_length`` the result is ``[]``.
    """
    matches = list(_INLINE_SUBCLAUSE.finditer(body))
    if len(matches) < 2:
        return []

    parts: list[dict] = []
    head = body[:matches[0].start()].strip()
    if len(head) >= 30:
        parts.append({
            "text":   head,
            "number": parent_number,
            "kind":   "decimal" if parent_number else "paragraph",
        })

    # Short chunks met before any part exists are parked here together with
    # the marker they started with, so their text is not lost.
    carried_text = ""
    carried_marker = None

    boundaries = [m.start() for m in matches] + [len(body)]
    for m, start, end in zip(matches, boundaries, boundaries[1:]):
        chunk = body[start:end].strip()
        sub_marker = m.group(1).strip()
        if carried_text:
            chunk = f"{carried_text}\n{chunk}"
            sub_marker = carried_marker

        if len(chunk) < min_length:
            if parts:
                parts[-1]["text"] = (parts[-1]["text"] + "\n" + chunk).strip()
            else:
                carried_text, carried_marker = chunk, sub_marker
            continue

        carried_text, carried_marker = "", None
        composite = f"{parent_number}{sub_marker}" if parent_number else sub_marker
        parts.append({
            "text":   chunk,
            "number": composite,
            "kind":   "subclause",
        })
    return parts


def _hard_cap_split(clause: dict, max_len: int = MAX_CLAUSE_CHARS) -> list[dict]:
    """Break an over-long clause into sentence-aligned chunks.

    A clause whose text fits within ``max_len`` is returned unchanged as a
    one-element list. Otherwise the text is cut where sentence-ending
    punctuation is followed by whitespace and a capital letter, and sentences
    are packed greedily into chunks. Chunks shorter than MIN_SUBCLAUSE_LEN are
    dropped. Each emitted dict keeps the clause's number and gets
    "/chunked" appended to its kind.
    """
    body = clause["text"]
    if len(body) <= max_len:
        return [clause]

    pieces: list[str] = []
    buffer = ""
    for sentence in re.split(r'(?<=[\.\?\!])\s+(?=[A-Z])', body):
        if not buffer:
            buffer = sentence
        elif len(buffer) + len(sentence) + 1 > max_len:
            # Adding this sentence would overflow: close the current chunk.
            pieces.append(buffer.strip())
            buffer = sentence
        else:
            buffer = (buffer + " " + sentence).strip()
    if buffer:
        pieces.append(buffer.strip())

    result: list[dict] = []
    for piece in pieces:
        if len(piece) < MIN_SUBCLAUSE_LEN:
            continue
        result.append({
            "text":   piece,
            "number": clause.get("number"),
            "kind":   clause.get("kind", "paragraph") + "/chunked",
        })
    return result


# ─────────────────────────────────────────────────────────────────────────────
# Public API
# ─────────────────────────────────────────────────────────────────────────────
def split_into_clauses_with_metadata(
    text: str,
    min_length: int = 40,
) -> list[dict]:
    """Segment text into clauses, filter TOC/garbage, return a clean list.

    Pipeline:
      1. Primary segmentation at detected headers. Text that precedes the
         first header is kept as a "paragraph" segment. If no header yields a
         segment, the text is split on blank lines instead.
      2. Segments rejected by is_garbage_clause() are dropped.
      3. Segments longer than LONG_CLAUSE_CHARS are split at inline
         "(a) ... (b) ..." markers when possible.
      4. Every segment is capped in length by _hard_cap_split().
      5. A final garbage sweep removes tiny chunks produced by the splitting.

    Per the original author's note this is the single entry point used by
    analyze_document() (that caller is not part of this file).

    Args:
        text: Cleaned document text.
        min_length: Minimum character length of a primary segment.

    Returns:
        A non-empty list of ``{"text", "number", "kind"}`` dicts. When every
        segment is filtered out, a single "paragraph" entry holding the first
        2000 characters of ``text`` is returned instead of an empty list.
    """
    headers = _collect_headers(text)

    # -- Primary segmentation (heading-based) --------------------------------
    primary: list[dict] = []
    if headers:
        segments: list[dict] = []
        for i, (start, marker, kind) in enumerate(headers):
            end  = headers[i + 1][0] if i + 1 < len(headers) else len(text)
            body = text[start:end].strip()
            if len(body) >= min_length:
                segments.append({"text": body, "number": marker, "kind": kind})
        if segments:
            # Keep whatever precedes the first header (e.g. the recitals);
            # otherwise that text would silently vanish from the output.
            preamble = text[:headers[0][0]].strip()
            if len(preamble) >= min_length:
                primary.append(
                    {"text": preamble, "number": None, "kind": "paragraph"}
                )
            primary.extend(segments)

    # Paragraph fallback when no header produced a usable segment
    if not primary:
        for p in [p.strip() for p in re.split(r'\n\s*\n', text)]:
            if len(p) >= min_length:
                primary.append({"text": p, "number": None, "kind": "paragraph"})

    # -- TOC / garbage filter (new in v5.4) ----------------------------------
    primary = [c for c in primary if not is_garbage_clause(c["text"])]

    if not primary:
        # If everything was filtered, fall back to treating the full text as one
        # clause rather than returning an empty list (caller handles it).
        return [{"text": text[:2000], "number": None, "kind": "paragraph"}]

    # -- Secondary pass: inline subclause splitting for long clauses ---------
    refined: list[dict] = []
    for clause in primary:
        if len(clause["text"]) > LONG_CLAUSE_CHARS:
            subs = _split_inline_subclauses(
                clause["text"],
                parent_number=clause.get("number"),
            )
            if subs:
                refined.extend(subs)
                continue
        refined.append(clause)

    # -- Tertiary pass: hard length cap --------------------------------------
    final: list[dict] = []
    for clause in refined:
        final.extend(_hard_cap_split(clause))

    # -- Final garbage sweep after splitting ---------------------------------
    # Splitting can produce tiny chunks; filter them out too.
    final = [c for c in final if not is_garbage_clause(c["text"])]

    # ASCII arrows keep the log line readable whatever the console encoding.
    print(f"[INFO] Segmentation: {len(primary)} primary -> "
          f"{len(refined)} refined -> {len(final)} final clean clauses")

    return final if final else [{"text": text[:2000], "number": None, "kind": "paragraph"}]


def split_into_clauses(text: str, min_length: int = 40) -> list[str]:
    """Backward-compatible wrapper: return only the clause texts.

    Runs split_into_clauses_with_metadata() and discards the "number" and
    "kind" fields of each entry.
    """
    clauses = split_into_clauses_with_metadata(text, min_length)
    return [entry["text"] for entry in clauses]