File size: 1,642 Bytes
4b022af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import re
from typing import List
def segment_into_clauses(full_text: str, *, min_length: int = 50) -> List[str]:
    """Split a document into its individual clauses.

    The text is first split at line-leading clause markers such as "1.",
    "1.1", "(a)", "iv.", "IV.", "Article 1:", "Section 2:" and "Clause 3:".
    The markers themselves are removed from the returned clauses. If that
    yields at most one clause, the text is split instead on blank lines and
    on line breaks that follow a period and precede a capital letter.

    Args:
        full_text: The complete document text.
        min_length: Pieces whose stripped length is not greater than this
            value are treated as noise (headings, stray markers) and dropped.

    Returns:
        The stripped clause texts, in document order. Empty when no piece
        exceeds ``min_length``.
    """
    # Marker bodies only; the shared "start of line" prefix is added below.
    marker_alternatives = [
        r'\d+\.\s+',             # "1. ", "2. ", etc.
        r'\d+\.\d+\s+',          # "1.1 ", "1.2 ", etc.
        r'\(\w+\)\s+',           # "(a) ", "(b) ", etc.
        r'[ivx]+\.\s+',          # "i. ", "ii. ", "iii. ", etc.
        r'[IVX]+\.\s+',          # "I. ", "II. ", "III. ", etc.
        r'Article\s+\d+\s*:',    # "Article 1:", "Article 2:", etc.
        r'Section\s+\d+\s*:',    # "Section 1:", "Section 2:", etc.
        r'Clause\s+\d+\s*:',     # "Clause 1:", "Clause 2:", etc.
    ]
    # A marker counts at the very start of the text as well as after a
    # newline; otherwise the first clause would keep its "1. " prefix while
    # every later clause has its marker stripped.
    clause_start = r'(?:\A|\n)\s*(?:' + '|'.join(marker_alternatives) + r')'

    def _substantial(pieces: List[str]) -> List[str]:
        """Strip each piece and keep those longer than ``min_length``."""
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if len(piece) > min_length]

    clauses = _substantial(re.split(clause_start, full_text))
    if len(clauses) > 1:
        return clauses

    # Fallback: paragraph breaks, or a line break between a sentence-ending
    # period and a capitalised line. The lookbehind/lookahead keep the period
    # and the capital letter in the clauses instead of consuming them as part
    # of the delimiter.
    fallback = _substantial(
        re.split(r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])', full_text)
    )
    # Keep the primary result if the fallback found nothing substantial, so a
    # single valid clause is never thrown away.
    return fallback or clauses
|