import re
from typing import List

# Clause markers recognised at the start of the text or at the start of a line.
# Each entry matches only the marker itself; the line-start anchor is added
# once when the patterns are combined below.
_CLAUSE_MARKERS = (
    r'\d+\.\s+',           # "1. ", "2. ", etc.
    r'\d+\.\d+\s+',        # "1.1 ", "1.2 ", etc.
    r'\(\w+\)\s+',         # "(a) ", "(b) ", etc.
    r'[ivx]+\.\s+',        # "i. ", "ii. ", "iii. ", etc.
    r'[IVX]+\.\s+',        # "I. ", "II. ", "III. ", etc.
    r'Article\s+\d+\s*:',  # "Article 1:", "Article 2:", etc.
    r'Section\s+\d+\s*:',  # "Section 1:", "Section 2:", etc.
    r'Clause\s+\d+\s*:',   # "Clause 1:", "Clause 2:", etc.
)

# `(?:^|\n)` lets a marker at the very beginning of the document count as a
# boundary too, so the first clause is stripped of its marker like the others.
# Only non-capturing groups are used so that re.split() returns just the text
# between boundaries.
_CLAUSE_BOUNDARY_RE = re.compile(
    r'(?:^|\n)\s*(?:' + '|'.join(_CLAUSE_MARKERS) + r')'
)

# Fallback boundaries: a blank line, or a line break that follows a period and
# precedes a capital letter. The period and the capital letter are checked with
# lookbehind/lookahead so they stay part of their clauses instead of being
# swallowed by the split.
_PARAGRAPH_BOUNDARY_RE = re.compile(r'\n\s*\n|(?<=\.)\s*\n\s*(?=[A-Z])')


def _substantial_pieces(pieces: List[str], min_length: int) -> List[str]:
    """Strip each piece and keep those longer than *min_length* characters."""
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if len(piece) > min_length]


def segment_into_clauses(full_text: str, *, min_length: int = 50) -> List[str]:
    """Segment the full document text into individual clauses.

    The text is first split on common clause markers found at the start of
    the text or of a line: "1.", "1.1", "(a)", "i.", "I.", "Article 1:",
    "Section 1:" and "Clause 1:". The markers themselves are removed from
    the returned clauses.

    If that yields at most one clause, the text is split instead on blank
    lines and on line breaks that follow a period and precede a capital
    letter.

    Args:
        full_text: The document text to segment.
        min_length: Fragments whose stripped length does not exceed this
            many characters are discarded as insubstantial.

    Returns:
        The stripped clause texts in document order. Empty input yields an
        empty list.
    """
    if not full_text:
        return []

    clauses = _substantial_pieces(
        _CLAUSE_BOUNDARY_RE.split(full_text), min_length
    )
    if len(clauses) > 1:
        return clauses

    # Marker-based splitting did not find real structure; try paragraphs.
    paragraphs = _substantial_pieces(
        _PARAGRAPH_BOUNDARY_RE.split(full_text), min_length
    )
    # Never return less than the marker pass already found: if the fallback
    # comes up empty, keep the single clause (if any) from the first pass.
    return paragraphs or clauses