File size: 1,642 Bytes
4b022af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
from typing import List


def segment_into_clauses(full_text: str, *, min_length: int = 50) -> List[str]:
    """
    Segment the full document text into individual clauses using regex patterns.

    The text is first split on common clause markers that begin a line:
    "1.", "1.1", "(a)", "iv.", "IV.", "Article 1:", "Section 1:" and
    "Clause 1:". The markers themselves are consumed by the split and do not
    appear in the returned clauses. Every marker pattern requires a preceding
    newline, so a marker on the very first line of the text is not matched.

    If the marker-based split yields at most one substantial clause, the text
    is instead split on blank lines and on line breaks that follow a period
    and precede a capital letter.

    Args:
        full_text: The document text to segment.
        min_length: A piece is kept only when its stripped length is strictly
            greater than this value. Defaults to 50.

    Returns:
        The whitespace-stripped clauses in document order (possibly empty).
    """

    def _substantial(pieces):
        # Strip once, then drop fragments too short to be a real clause.
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if len(piece) > min_length]

    # Line-leading clause markers; each one must directly follow a newline.
    clause_patterns = (
        r'\n\s*\d+\.\s+',  # "1. ", "2. ", etc.
        r'\n\s*\d+\.\d+\s+',  # "1.1 ", "1.2 ", etc.
        r'\n\s*\(\w+\)\s+',  # "(a) ", "(b) ", etc.
        r'\n\s*[ivx]+\.\s+',  # "i. ", "ii. ", "iii. ", etc.
        r'\n\s*[IVX]+\.\s+',  # "I. ", "II. ", "III. ", etc.
        r'\n\s*Article\s+\d+\s*:',  # "Article 1:", "Article 2:", etc.
        r'\n\s*Section\s+\d+\s*:',  # "Section 1:", "Section 2:", etc.
        r'\n\s*Clause\s+\d+\s*:',  # "Clause 1:", "Clause 2:", etc.
    )

    # Combine all patterns with the OR operator and split on any of them.
    combined_pattern = '|'.join(clause_patterns)
    cleaned_clauses = _substantial(re.split(combined_pattern, full_text))

    # If the markers produced nothing useful, fall back to a simpler split.
    if len(cleaned_clauses) <= 1:
        # Split on blank lines, or on a line break that sits between a period
        # and a capital letter. The lookbehind/lookahead keep the period and
        # the capital letter in the text; consuming them would truncate the
        # end of one clause and the first letter of the next.
        fallback_pattern = r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])'
        cleaned_clauses = _substantial(re.split(fallback_pattern, full_text))

    return cleaned_clauses