Spaces:
Running
Running
| import re | |
| from typing import List | |
class ClauseSegmenter:
    """
    A clause segmenter designed by T. de la Selle, Institut des Sources Chretiennes.

    It chunks text based on length constraints while greedily seeking
    the most logical grammatical boundary (punctuation or conjunctions).

    Split points are ranked by priority (lower is better):
    1 = sentence end (. ! ?), 2 = strong clause break (; : em dash),
    3 = comma, 4 = a word listed in ``linguistic_markers`` (the split is
    placed *before* such a word so that it opens the next chunk).
    """

    # Priority reported for a word that offers no split point at all.
    _NO_SPLIT = 99
    _SENTENCE_END = ('.', '!', '?')
    _STRONG_BREAK = (';', ':', '—')
    _WEAK_BREAK = (',',)
    # Closing quotes/brackets that may follow punctuation, e.g. `dixit."`.
    _TRAILING_CLOSERS = '"\'”’»)]}'
    _NON_WORD_RE = re.compile(r'[^\w\s]')

    def __init__(self, min_words: int = 10, max_words: int = 35, merge_orphans: bool = True, linguistic_markers: set = None):
        """
        Args:
            min_words: Minimum number of words a chunk should contain for a
                punctuation/marker split to be accepted.
            max_words: Maximum number of words scanned (and emitted) per chunk.
            merge_orphans: When true, a chunk shorter than ``min_words`` is
                appended to the previous chunk if the result still fits in
                ``max_words``.
            linguistic_markers: Lower-case words (e.g. conjunctions) that may
                start a new chunk. ``None`` or an empty set disables them.

        Raises:
            ValueError: If ``max_words`` is smaller than 1; no chunk could
                ever be produced and segmentation would never advance.
        """
        if max_words < 1:
            raise ValueError("max_words must be at least 1")
        self.min_words = min_words
        self.max_words = max_words
        self.merge_orphans = merge_orphans
        self.linguistic_markers = linguistic_markers or set()

    def _evaluate_word(self, word: str):
        """
        Evaluates a word to determine if it's a good split point.

        Returns a tuple: (priority_score, split_after_word).
        Lower priority score means it's a better split point; a score of
        ``_NO_SPLIT`` (99) means the word is not a split point.
        """
        # Look through closing quotes/brackets so that `est."` or `est.)`
        # is still recognised as ending with a full stop.
        core = word.rstrip(self._TRAILING_CLOSERS)

        # 1. Primary: end of sentence
        if core.endswith(self._SENTENCE_END):
            return 1, True
        # 2. Secondary: strong clause break
        if core.endswith(self._STRONG_BREAK):
            return 2, True
        # 3. Tertiary: weak clause break
        if core.endswith(self._WEAK_BREAK):
            return 3, True

        # 4. Linguistic fallback: compare the bare, lower-cased word
        #    (punctuation removed) against the configured markers.
        bare = self._NON_WORD_RE.sub('', word.lower())
        if bare in self.linguistic_markers:
            # Split BEFORE the marker so it begins the next chunk.
            return 4, False

        return self._NO_SPLIT, True

    def _choose_split(self, words: List[str], start: int) -> int:
        """
        Return the index of the last word of the chunk that begins at ``start``.

        Preference order: the best-priority split that satisfies ``min_words``
        for both the chunk and the remaining text; otherwise the best-priority
        split that satisfies ``min_words`` for the chunk only; otherwise a
        forced cut at ``max_words`` (or end of text).
        """
        total = len(words)
        window_end = min(start + self.max_words, total)

        best_valid_cut = None
        best_valid_priority = self._NO_SPLIT
        best_any_cut = None
        best_any_priority = self._NO_SPLIT

        for idx in range(start, window_end):
            priority, split_after = self._evaluate_word(words[idx])
            if priority >= self._NO_SPLIT:
                continue

            cut = idx if split_after else idx - 1
            # A marker on the first word of the window would cut *before*
            # the window and yield an empty chunk (and no forward progress).
            if cut < start:
                continue
            if cut - start + 1 < self.min_words:
                continue

            tail = total - (cut + 1)
            if tail == 0 or tail >= self.min_words:
                if priority < best_valid_priority:
                    best_valid_priority = priority
                    best_valid_cut = cut
                    # A valid sentence end cannot be beaten; stop scanning.
                    if priority == 1:
                        break

            # Remember the best split even if it would leave a short tail.
            if priority < best_any_priority:
                best_any_priority = priority
                best_any_cut = cut

        if best_valid_cut is not None:
            return best_valid_cut
        if best_any_cut is not None:
            return best_any_cut

        # No usable punctuation/marker in the window: force a cut at
        # max_words (or at the end of the text).
        last_allowed = start + self.max_words - 1
        cut = min(last_allowed, total - 1)
        tail = total - (cut + 1)
        if 0 < tail < self.min_words:
            # Pull the cut back so exactly min_words words remain, provided
            # that still lies inside the current window.
            preferred = total - self.min_words - 1
            if start <= preferred <= last_allowed:
                cut = preferred
        return cut

    def _merge_short_chunks(self, pieces: List[List[str]]) -> List[List[str]]:
        """
        Append each chunk shorter than ``min_words`` to its predecessor when
        the combined length does not exceed ``max_words``.
        """
        merged = []
        for piece in pieces:
            if (
                merged
                and len(piece) < self.min_words
                and len(merged[-1]) + len(piece) <= self.max_words
            ):
                merged[-1] = merged[-1] + piece
            else:
                merged.append(piece)
        return merged

    def segment(self, text: str) -> List[str]:
        """
        Segments the text into a list of strings (clauses/chunks).

        The text is split on whitespace; each returned chunk is its words
        re-joined with single spaces. Blank input yields an empty list.
        """
        words = text.split()
        if not words:
            return []

        total = len(words)
        pieces = []
        start = 0
        while start < total:
            cut = self._choose_split(words, start)
            pieces.append(words[start:cut + 1])
            # _choose_split always returns cut >= start, so this advances.
            start = cut + 1

        if self.merge_orphans:
            pieces = self._merge_short_chunks(pieces)
        return [" ".join(piece) for piece in pieces]
| # ========================================== | |
| # Example Usage on Augustine's Confessions | |
| # ========================================== | |
| # if __name__ == "__main__": | |
| # # A famous, long, complex passage from Augustine | |
| # augustine_text = ( | |
| # "Fecisti nos ad te, Domine, et inquietum est cor nostrum donec requiescat in te. " | |
| # "Quoniam magnus es tu, et laudabilis valde: magna virtus tua, et sapientiae tuae non est numerus. " | |
| # "Et laudare te vult homo, aliqua portio creaturae tuae, et homo circumferens mortalitatem suam, " | |
| # "circumferens testimonium peccati sui, et testimonium quia superbis resistis." | |
| # ) | |
| # # Latin conjunctions that naturally start a new clause or thought | |
| # linguistic_markers = { | |
| # 'et', 'sed', 'quia', 'quoniam', 'autem', | |
| # 'enim', 'vero', 'nam', 'sicut', 'igitur', 'ergo' | |
| # } | |
| # # We set strict limits: minimum 10 words, maximum 18 words per chunk | |
| # segmenter = ClauseSegmenter(min_words=10, max_words=18, linguistic_markers=linguistic_markers) | |
| # clauses = segmenter.segment(augustine_text) | |
| # print("Segmented Clauses:") | |
| # for i, clause in enumerate(clauses): | |
| # word_count = len(clause.split()) | |
| # print(f"[{i + 1}] ({word_count:2d} words): {clause}") |