import re
from typing import List, Optional, Set, Tuple


class ClauseSegmenter:
    """Split running text into clause-sized chunks of words.

    A clause segmenter designed by T. de la Selle, Institut des Sources
    Chretiennes. It chunks text based on length constraints while greedily
    seeking the most logical grammatical boundary (punctuation or
    conjunctions).

    Split points are ranked by priority (lower is better):

    1. end of sentence (``.``, ``!``, ``?``)
    2. strong clause break (``;``, ``:``, ``—``)
    3. weak clause break (``,``)
    4. a linguistic marker (e.g. a conjunction); the split is made *before*
       the marker so that it opens the next chunk.
    """

    # Priorities returned by ``_evaluate_word``.
    _SENTENCE_END_PRIORITY = 1
    _STRONG_BREAK_PRIORITY = 2
    _WEAK_BREAK_PRIORITY = 3
    _MARKER_PRIORITY = 4
    _NO_SPLIT_PRIORITY = 99  # the word is not a usable split point

    _SENTENCE_END_SUFFIXES = ('.', '!', '?')
    _STRONG_BREAK_SUFFIXES = (';', ':', '—')
    _WEAK_BREAK_SUFFIXES = (',',)

    # Everything that is neither a word character nor whitespace.
    _NON_WORD_RE = re.compile(r'[^\w\s]')

    def __init__(self, min_words: int = 10, max_words: int = 35,
                 merge_orphans: bool = True,
                 linguistic_markers: Optional[Set[str]] = None):
        """Configure the segmenter.

        Args:
            min_words: Minimum number of words a chunk (and the text left
                after it) should contain for a boundary to be preferred.
            max_words: Maximum number of words scanned for, and placed in,
                a single chunk. Must be at least 1.
            merge_orphans: When true, a chunk shorter than ``min_words`` is
                appended to the previous chunk if the result still fits in
                ``max_words``.
            linguistic_markers: Words that may open a new chunk. Candidate
                words are lower-cased and stripped of punctuation before
                the lookup, so markers must be given in lower case.

        Raises:
            ValueError: If ``max_words`` is smaller than 1 (no chunk could
                ever hold a word, so segmentation could not terminate).
        """
        if max_words < 1:
            raise ValueError(f"max_words must be at least 1, got {max_words}")
        self.min_words = min_words
        self.max_words = max_words
        self.merge_orphans = merge_orphans
        self.linguistic_markers = linguistic_markers or set()

    def _evaluate_word(self, word: str) -> Tuple[int, bool]:
        """Rate *word* as a potential split point.

        Returns:
            A tuple ``(priority_score, split_after_word)``. A lower priority
            score means a better split point; 99 means "not a split point".
            ``split_after_word`` is False only for linguistic markers, where
            the split belongs before the word.
        """
        # 1. Primary: end of sentence
        if word.endswith(self._SENTENCE_END_SUFFIXES):
            return self._SENTENCE_END_PRIORITY, True

        # 2. Secondary: strong clause break
        if word.endswith(self._STRONG_BREAK_SUFFIXES):
            return self._STRONG_BREAK_PRIORITY, True

        # 3. Tertiary: weak clause break
        if word.endswith(self._WEAK_BREAK_SUFFIXES):
            return self._WEAK_BREAK_PRIORITY, True

        # 4. Linguistic fallback: conjunctions.
        # Strip punctuation so that e.g. "(et" still matches "et".
        clean_word = self._NON_WORD_RE.sub('', word.lower())
        if clean_word in self.linguistic_markers:
            # Split BEFORE the conjunction so it begins the next chunk.
            return self._MARKER_PRIORITY, False

        return self._NO_SPLIT_PRIORITY, True

    def _find_split(self, words: List[str], start_idx: int) -> int:
        """Return the index of the last word of the chunk starting at *start_idx*.

        Scans at most ``max_words`` words and prefers, in this order:

        * the best-priority boundary that satisfies ``min_words`` for the
          chunk and leaves either nothing or at least ``min_words`` words;
        * the best-priority boundary that satisfies ``min_words`` for the
          chunk only;
        * a forced cut at ``max_words`` (or the end of the text), moved
          earlier when that leaves a tail of exactly ``min_words`` words
          instead of a shorter one.
        """
        total_words = len(words)
        window_end = min(start_idx + self.max_words, total_words)

        best_idx: Optional[int] = None
        best_priority = self._NO_SPLIT_PRIORITY
        best_valid_idx: Optional[int] = None
        best_valid_priority = self._NO_SPLIT_PRIORITY

        for idx in range(start_idx, window_end):
            priority, split_after = self._evaluate_word(words[idx])
            if priority >= self._NO_SPLIT_PRIORITY:
                continue

            # Index of the last word that would belong to this chunk.
            candidate = idx if split_after else idx - 1
            chunk_length = candidate - start_idx + 1

            # A marker on the first word of the window would produce an
            # empty chunk (and no progress); never accept it, even when
            # min_words is zero or negative.
            if chunk_length < 1 or chunk_length < self.min_words:
                continue

            remaining = total_words - (candidate + 1)
            if remaining == 0 or remaining >= self.min_words:
                # Strict '<' keeps the earliest boundary among equals.
                if priority < best_valid_priority:
                    best_valid_priority = priority
                    best_valid_idx = candidate
                # A valid sentence end cannot be beaten: stop scanning.
                if priority == self._SENTENCE_END_PRIORITY:
                    break

            # Fallback: long enough, but may leave a short tail behind.
            if priority < best_priority:
                best_priority = priority
                best_idx = candidate

        if best_valid_idx is not None:
            return best_valid_idx
        if best_idx is not None:
            return best_idx

        # No usable punctuation/marker found in the window.
        # Force a split at max_words (or end of text).
        max_split = start_idx + self.max_words - 1
        split_point = min(max_split, total_words - 1)
        remaining = total_words - (split_point + 1)
        if 0 < remaining < self.min_words:
            # Cut earlier so the tail holds exactly min_words words.
            preferred_split = total_words - self.min_words - 1
            if start_idx <= preferred_split <= max_split:
                split_point = preferred_split
        return split_point

    def _merge_orphans(self, chunks: List[List[str]]) -> List[List[str]]:
        """Append chunks shorter than ``min_words`` to their predecessor.

        A short chunk is merged only when a previous chunk exists and the
        merged chunk does not exceed ``max_words``; otherwise it is kept.
        """
        merged: List[List[str]] = []
        for chunk in chunks:
            is_orphan = len(chunk) < self.min_words
            if (is_orphan and merged
                    and len(merged[-1]) + len(chunk) <= self.max_words):
                merged[-1] = merged[-1] + chunk
            else:
                merged.append(chunk)
        return merged

    def segment(self, text: str) -> List[str]:
        """Segment *text* into a list of strings (clauses/chunks).

        The text is split on whitespace, so every run of whitespace
        (including newlines) becomes a single space in the output.

        Args:
            text: The text to segment.

        Returns:
            The chunks in reading order; an empty list when *text* is empty
            or contains only whitespace.
        """
        words = text.split()
        if not words:
            return []

        total_words = len(words)
        chunks: List[List[str]] = []
        start_idx = 0

        while start_idx < total_words:
            # Clamp so every iteration consumes at least one word, whatever
            # the configured limits are.
            split_point = max(self._find_split(words, start_idx), start_idx)
            chunks.append(words[start_idx:split_point + 1])
            start_idx = split_point + 1

        # Post-processing: handle fragments shorter than min_words.
        if self.merge_orphans:
            chunks = self._merge_orphans(chunks)

        return [" ".join(chunk) for chunk in chunks]


# ==========================================
# Example Usage on Augustine's Confessions
# ==========================================
# if __name__ == "__main__":
#     # A famous, long, complex passage from Augustine
#     augustine_text = (
#         "Fecisti nos ad te, Domine, et inquietum est cor nostrum donec requiescat in te. "
#         "Quoniam magnus es tu, et laudabilis valde: magna virtus tua, et sapientiae tuae non est numerus. "
#         "Et laudare te vult homo, aliqua portio creaturae tuae, et homo circumferens mortalitatem suam, "
#         "circumferens testimonium peccati sui, et testimonium quia superbis resistis."
#     )
#     # Latin conjunctions that naturally start a new clause or thought
#     linguistic_markers = {
#         'et', 'sed', 'quia', 'quoniam', 'autem',
#         'enim', 'vero', 'nam', 'sicut', 'igitur', 'ergo'
#     }
#     # We set strict limits: minimum 10 words, maximum 18 words per chunk
#     segmenter = ClauseSegmenter(min_words=10, max_words=18, linguistic_markers=linguistic_markers)
#     clauses = segmenter.segment(augustine_text)
#     print("Segmented Clauses:")
#     for i, clause in enumerate(clauses):
#         word_count = len(clause.split())
#         print(f"[{i + 1}] ({word_count:2d} words): {clause}")