Paraphrasis / segmenter.py
Théotime de la Selle
Minor improvements
966b15c
import re
from typing import List
class ClauseSegmenter:
    """
    A clause segmenter designed by T. de la Selle, Institut des Sources Chrétiennes.

    It chunks whitespace-separated text based on length constraints while
    greedily seeking the most logical grammatical boundary (punctuation or
    conjunctions) inside a sliding window of at most ``max_words`` words.

    Attributes:
        min_words: Minimum number of words a punctuation/marker split may
            produce. Forced splits and the final chunk can be shorter.
        max_words: Size of the scanning window, i.e. the maximum number of
            words in a chunk produced by a single split.
        merge_orphans: When true, chunks shorter than ``min_words`` are glued
            onto the preceding chunk if the result fits in ``max_words``.
        linguistic_markers: Lower-case words (typically conjunctions) before
            which a split is allowed.
    """

    # Priority returned for words that are not split points at all.
    _NO_SPLIT = 99
    # Suffix tuples let ``str.endswith`` test every mark in a single call.
    _SENTENCE_END = ('.', '!', '?')
    _STRONG_BREAK = (';', ':', '—')
    # Everything that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, min_words: int = 10, max_words: int = 35, merge_orphans: bool = True, linguistic_markers: set = None):
        """
        Configure the segmenter.

        Args:
            min_words: Minimum chunk length, in words, for a grammatical split.
            max_words: Maximum chunk length, in words; must be at least 1.
            merge_orphans: Whether to merge short chunks into their predecessor.
            linguistic_markers: Words that may start a new chunk; ``None`` or
                an empty collection disables marker-based splitting.

        Raises:
            ValueError: If ``max_words`` is smaller than 1. Such a window can
                never consume a word, so ``segment`` would loop forever.
        """
        if max_words < 1:
            raise ValueError(f"max_words must be at least 1, got {max_words}")
        self.min_words = min_words
        self.max_words = max_words
        self.merge_orphans = merge_orphans
        self.linguistic_markers = linguistic_markers or set()

    def _evaluate_word(self, word: str) -> tuple:
        """
        Evaluates a word to determine if it's a good split point.

        Returns a tuple ``(priority_score, split_after_word)``. A lower
        priority score means a better split point; 99 means "not a split
        point". ``split_after_word`` is false only for linguistic markers,
        which must begin the next chunk rather than end the current one.
        """
        # 1. Primary: end of sentence
        if word.endswith(self._SENTENCE_END):
            return 1, True
        # 2. Secondary: strong clause break
        if word.endswith(self._STRONG_BREAK):
            return 2, True
        # 3. Tertiary: weak clause break
        if word.endswith(','):
            return 3, True
        # 4. Linguistic fallback: compare the lower-cased word, stripped of
        # punctuation, against the configured markers.
        clean_word = self._PUNCTUATION.sub('', word.lower())
        if clean_word in self.linguistic_markers:
            # Split BEFORE the conjunction so it begins the next chunk
            return 4, False
        return self._NO_SPLIT, True

    def _find_split_point(self, words: List[str], start_idx: int) -> int:
        """
        Return the index of the last word of the chunk starting at ``start_idx``.

        The window ``words[start_idx : start_idx + max_words]`` is scanned for
        split candidates. Preference order:

        1. the best-priority candidate that yields at least ``min_words``
           words and leaves either nothing or at least ``min_words`` words
           after it (the scan stops at the first such sentence end);
        2. the best-priority candidate that only satisfies the minimum length;
        3. a forced cut at the end of the window, pulled back when possible so
           that exactly ``min_words`` words remain for the last chunk.

        Among candidates of equal priority the earliest one wins.
        The returned index is always >= ``start_idx``.
        """
        total_words = len(words)
        window_end = min(start_idx + self.max_words, total_words)

        fallback_idx, fallback_priority = -1, self._NO_SPLIT
        valid_idx, valid_priority = -1, self._NO_SPLIT

        for idx in range(start_idx, window_end):
            priority, split_after = self._evaluate_word(words[idx])
            if priority >= self._NO_SPLIT:
                continue

            # Index of the last word that would belong to the current chunk
            candidate = idx if split_after else idx - 1
            chunk_length = candidate - start_idx + 1

            # A marker on the first word of the window yields an empty chunk.
            # Accepting it (possible when min_words <= 0) would never advance
            # start_idx, so it is rejected regardless of min_words.
            if chunk_length < 1 or chunk_length < self.min_words:
                continue

            remaining = total_words - (candidate + 1)
            if remaining == 0 or remaining >= self.min_words:
                if priority < valid_priority:
                    valid_priority = priority
                    valid_idx = candidate
                if priority == 1:
                    # A fully valid sentence end cannot be beaten: stop here
                    break

            # Best separator seen so far, ignoring what is left after it
            if priority < fallback_priority:
                fallback_priority = priority
                fallback_idx = candidate

        if valid_idx != -1:
            return valid_idx
        if fallback_idx != -1:
            return fallback_idx

        # No usable punctuation/marker in the window:
        # force a split at max_words (or at the end of the text).
        max_split = start_idx + self.max_words - 1
        split_point = min(max_split, total_words - 1)
        remaining = total_words - (split_point + 1)
        if 0 < remaining < self.min_words:
            # Cut earlier so the tail keeps exactly min_words words
            preferred_split = total_words - self.min_words - 1
            if start_idx <= preferred_split <= max_split:
                split_point = preferred_split
        return split_point

    def _merge_short_chunks(self, chunks: List[str]) -> List[str]:
        """
        Append every chunk shorter than ``min_words`` to the chunk before it,
        provided the merged chunk does not exceed ``max_words`` words.
        """
        merged = []
        for chunk in chunks:
            chunk_len = len(chunk.split())
            if merged and chunk_len < self.min_words:
                prev_len = len(merged[-1].split())
                if prev_len + chunk_len <= self.max_words:
                    merged[-1] += " " + chunk
                    continue
            merged.append(chunk)
        return merged

    def segment(self, text: str) -> List[str]:
        """
        Segments the text into a list of strings (clauses/chunks).

        The text is split on whitespace; each chunk is the corresponding words
        re-joined with single spaces, so original spacing and line breaks are
        not preserved. Empty or whitespace-only input yields an empty list.

        Args:
            text: The text to segment.

        Returns:
            The chunks in reading order; concatenating their words gives back
            every word of ``text`` exactly once.
        """
        words = text.split()
        if not words:
            return []

        total_words = len(words)
        chunks = []
        start_idx = 0
        while start_idx < total_words:
            split_point = self._find_split_point(words, start_idx)
            chunks.append(" ".join(words[start_idx:split_point + 1]))
            # Advance the starting index for the next window
            start_idx = split_point + 1

        # Post-processing: re-attach fragments that ended up too short
        if self.merge_orphans:
            return self._merge_short_chunks(chunks)
        return chunks
# ==========================================
# Example Usage on Augustine's Confessions
# ==========================================
# if __name__ == "__main__":
# # A famous, long, complex passage from Augustine
# augustine_text = (
# "Fecisti nos ad te, Domine, et inquietum est cor nostrum donec requiescat in te. "
# "Quoniam magnus es tu, et laudabilis valde: magna virtus tua, et sapientiae tuae non est numerus. "
# "Et laudare te vult homo, aliqua portio creaturae tuae, et homo circumferens mortalitatem suam, "
# "circumferens testimonium peccati sui, et testimonium quia superbis resistis."
# )
# # Latin conjunctions that naturally start a new clause or thought
# linguistic_markers = {
# 'et', 'sed', 'quia', 'quoniam', 'autem',
# 'enim', 'vero', 'nam', 'sicut', 'igitur', 'ergo'
# }
# # We set strict limits: minimum 10 words, maximum 18 words per chunk
# segmenter = ClauseSegmenter(min_words=10, max_words=18, linguistic_markers=linguistic_markers)
# clauses = segmenter.segment(augustine_text)
# print("Segmented Clauses:")
# for i, clause in enumerate(clauses):
# word_count = len(clause.split())
# print(f"[{i + 1}] ({word_count:2d} words): {clause}")