Spaces:

TdelaSelle
/

Paraphrasis

Running

Théotime de la Selle

Minor improvements

966b15c 9 days ago

6.74 kB

	import re
	from typing import List

	class ClauseSegmenter:
	    """
	    A clause segmenter designed by T. de la Selle, Institut des Sources Chretiennes.

	    The text is tokenised on whitespace and cut into chunks of roughly
	    ``min_words`` to ``max_words`` words. Within each window of at most
	    ``max_words`` words the segmenter greedily looks for the most logical
	    grammatical boundary: end of sentence, then strong clause break, then
	    comma, then the position just before a caller-supplied conjunction.

	    Attributes:
	        min_words: Minimum number of words a chunk (and the text left after
	            it) should contain for a boundary to be considered valid.
	        max_words: Maximum number of words scanned for, and put into, a chunk.
	        merge_orphans: Whether undersized chunks are merged into the previous
	            chunk when the result still fits in ``max_words``.
	        linguistic_markers: Lower-cased words (typically conjunctions) that
	            should start a new chunk.
	    """

	    # Priority reported for a word that offers no split opportunity.
	    _NO_SPLIT = 99
	    # Trailing punctuation classes, strongest boundary first.
	    _SENTENCE_END = ('.', '!', '?')
	    _STRONG_BREAK = (';', ':', '—')
	    _WEAK_BREAK = (',',)
	    # Closing quotes/brackets that may follow the punctuation mark (e.g. `te."`).
	    _TRAILING_CLOSERS = '"\'”’»)]}'
	    # Everything that is neither a word character nor whitespace.
	    _NON_WORD_CHARS = re.compile(r'[^\w\s]')

	    def __init__(self, min_words: int = 10, max_words: int = 35, merge_orphans: bool = True, linguistic_markers: set = None):
	        """
	        Configure the segmenter.

	        Args:
	            min_words: Minimum words per chunk; values <= 0 disable the minimum.
	            max_words: Maximum words per chunk; must be at least 1.
	            merge_orphans: Merge undersized chunks into their predecessor
	                when the merged chunk does not exceed ``max_words``.
	            linguistic_markers: Iterable of words before which a split is
	                allowed. ``None`` means no markers.

	        Raises:
	            ValueError: If ``max_words`` is smaller than 1 (no chunk could
	                ever make progress through the text).
	        """
	        if max_words < 1:
	            raise ValueError(f"max_words must be at least 1, got {max_words}")
	        self.min_words = min_words
	        self.max_words = max_words
	        self.merge_orphans = merge_orphans
	        # Words are lower-cased before lookup, so the markers must be too;
	        # otherwise a marker such as 'Et' could never match.
	        self.linguistic_markers = {marker.lower() for marker in (linguistic_markers or ())}

	    def _evaluate_word(self, word: str) -> tuple:
	        """
	        Evaluate a word to determine whether it is a good split point.

	        Args:
	            word: A single whitespace-delimited token.

	        Returns:
	            A tuple ``(priority_score, split_after_word)``. A lower score is a
	            better split point; 99 means "not a split point".
	            ``split_after_word`` is False only for linguistic markers, where
	            the cut belongs before the word.
	        """
	        # Ignore closing quotes/brackets so `te."` still counts as a sentence end.
	        core = word.rstrip(self._TRAILING_CLOSERS)

	        # 1. Primary: end of sentence
	        if core.endswith(self._SENTENCE_END):
	            return 1, True

	        # 2. Secondary: strong clause break
	        if core.endswith(self._STRONG_BREAK):
	            return 2, True

	        # 3. Tertiary: weak clause break
	        if core.endswith(self._WEAK_BREAK):
	            return 3, True

	        # 4. Linguistic fallback: conjunctions, compared without punctuation.
	        clean_word = self._NON_WORD_CHARS.sub('', word.lower())
	        if clean_word in self.linguistic_markers:
	            # Split BEFORE the conjunction so it begins the next chunk.
	            return 4, False

	        return self._NO_SPLIT, True

	    def segment(self, text: str) -> List[str]:
	        """
	        Segment the text into a list of strings (clauses/chunks).

	        Args:
	            text: The text to segment. Runs of whitespace are collapsed to
	                single spaces in the output.

	        Returns:
	            The chunks in reading order; an empty list for empty or
	            whitespace-only input.
	        """
	        if not text or not text.strip():
	            return []

	        words = text.split()
	        total_words = len(words)
	        chunks = []
	        start_idx = 0

	        while start_idx < total_words:
	            split_point = self._choose_split(words, start_idx)
	            chunks.append(" ".join(words[start_idx:split_point + 1]))
	            # Advance the starting index for the next window.
	            start_idx = split_point + 1

	        if self.merge_orphans:
	            return self._merge_short_chunks(chunks)
	        return chunks

	    def _choose_split(self, words: List[str], start_idx: int) -> int:
	        """Return the index of the last word of the chunk starting at ``start_idx``."""
	        total_words = len(words)
	        window_end = min(start_idx + self.max_words, total_words)

	        # Best boundary that respects min_words on both sides of the cut.
	        best_valid = None
	        best_valid_priority = self._NO_SPLIT
	        # Best boundary that only guarantees the chunk itself is long enough.
	        best_any = None
	        best_any_priority = self._NO_SPLIT

	        for idx in range(start_idx, window_end):
	            priority, split_after = self._evaluate_word(words[idx])
	            if priority >= self._NO_SPLIT:
	                continue

	            candidate = idx if split_after else idx - 1
	            # A marker on the first word of the window would cut before the
	            # window itself, producing an empty chunk and no progress.
	            if candidate < start_idx:
	                continue
	            if candidate - start_idx + 1 < self.min_words:
	                continue

	            remaining = total_words - (candidate + 1)
	            if remaining == 0 or remaining >= self.min_words:
	                if priority < best_valid_priority:
	                    best_valid_priority, best_valid = priority, candidate
	                    if priority == 1:
	                        # A valid sentence end cannot be beaten; stop scanning.
	                        break
	            if priority < best_any_priority:
	                best_any_priority, best_any = priority, candidate

	        if best_valid is not None:
	            split_point = best_valid
	        elif best_any is not None:
	            split_point = best_any
	        else:
	            # No usable punctuation/marker in the window: force a split at
	            # max_words (or at the end of the text).
	            split_point = window_end - 1

	        # Avoid leaving a tail shorter than min_words after this chunk.
	        remaining = total_words - (split_point + 1)
	        if 0 < remaining < self.min_words:
	            preferred = total_words - self.min_words - 1
	            max_split = start_idx + self.max_words - 1
	            if start_idx <= preferred <= max_split:
	                head_len = preferred - start_idx + 1
	                fits_in_one = total_words - start_idx <= self.max_words
	                if head_len < self.min_words and fits_in_one:
	                    # Moving the cut back would only trade a short tail for a
	                    # short head; the rest of the text fits in one chunk.
	                    split_point = total_words - 1
	                else:
	                    split_point = preferred

	        # Defensive: always consume at least one word, even if the limits
	        # were changed to nonsensical values after construction.
	        return max(split_point, start_idx)

	    def _merge_short_chunks(self, chunks: List[str]) -> List[str]:
	        """Merge chunks shorter than ``min_words`` into their predecessor when it fits."""
	        merged = []
	        for chunk in chunks:
	            chunk_len = len(chunk.split())
	            if merged and chunk_len < self.min_words:
	                prev_len = len(merged[-1].split())
	                if prev_len + chunk_len <= self.max_words:
	                    merged[-1] = f"{merged[-1]} {chunk}"
	                    continue
	            merged.append(chunk)
	        return merged

	# ==========================================
	# Example Usage on Augustine's Confessions
	# ==========================================
	# if __name__ == "__main__":
	# # A famous, long, complex passage from Augustine
	# augustine_text = (
	# "Fecisti nos ad te, Domine, et inquietum est cor nostrum donec requiescat in te. "
	# "Quoniam magnus es tu, et laudabilis valde: magna virtus tua, et sapientiae tuae non est numerus. "
	# "Et laudare te vult homo, aliqua portio creaturae tuae, et homo circumferens mortalitatem suam, "
	# "circumferens testimonium peccati sui, et testimonium quia superbis resistis."
	# )

	# # Latin conjunctions that naturally start a new clause or thought
	# linguistic_markers = {
	# 'et', 'sed', 'quia', 'quoniam', 'autem',
	# 'enim', 'vero', 'nam', 'sicut', 'igitur', 'ergo'
	# }

	# # We set strict limits: minimum 10 words, maximum 18 words per chunk
	# segmenter = ClauseSegmenter(min_words=10, max_words=18, linguistic_markers=linguistic_markers)

	# clauses = segmenter.segment(augustine_text)

	# print("Segmented Clauses:")
	# for i, clause in enumerate(clauses):
	# word_count = len(clause.split())
	# print(f"[{i + 1}] ({word_count:2d} words): {clause}")