import re
from typing import List

class ClauseSegmenter:
    """Split running text into clause-sized chunks of words.

    A clause segmenter designed by T. de la Selle, Institut des Sources
    Chretiennes. Text is tokenised on whitespace and consumed window by
    window (at most ``max_words`` words per window). Inside each window the
    segmenter picks the best-ranked grammatical boundary (sentence end, strong
    clause break, comma, then a configured conjunction) whose chunk is at
    least ``min_words`` long, and falls back to a hard cut at ``max_words``
    when no boundary qualifies.

    Args:
        min_words: Minimum number of words a chunk should contain for a
            boundary to be accepted. Values <= 0 disable the minimum.
        max_words: Maximum number of words scanned (and emitted) per chunk.
            Must be at least 1.
        merge_orphans: When true, a chunk shorter than ``min_words`` is
            appended to the preceding chunk if the result does not exceed
            ``max_words``.
        linguistic_markers: Words (typically conjunctions) before which a
            split is allowed. Matching is case-insensitive.

    Raises:
        ValueError: If ``max_words`` is smaller than 1 (no chunk could ever
            be produced, which previously made ``segment`` loop forever).
    """

    # Priority returned for a word that is not a boundary at all.
    _NO_BREAK = 99
    # Punctuation classes, from best split point to worst.
    _SENTENCE_END = ('.', '!', '?')
    _STRONG_BREAK = (';', ':', '—')
    _WEAK_BREAK = (',',)
    # Closing quotes/brackets that may follow punctuation, e.g. `te."` or `est.)`.
    _CLOSERS = "\"'\u201d\u2019\u00bb)]}"
    # Everything that is neither a word character nor whitespace.
    _NON_WORD = re.compile(r'[^\w\s]')

    def __init__(self, min_words: int = 10, max_words: int = 35, merge_orphans: bool = True, linguistic_markers: set = None):
        if max_words < 1:
            raise ValueError(f"max_words must be at least 1, got {max_words!r}")
        self.min_words = min_words
        self.max_words = max_words
        self.merge_orphans = merge_orphans
        # Words are lower-cased before lookup, so the markers must be too.
        self.linguistic_markers = {marker.lower() for marker in (linguistic_markers or ())}

    def _evaluate_word(self, word: str):
        """Rate ``word`` as a potential split point.

        Args:
            word: A single whitespace-delimited token.

        Returns:
            A tuple ``(priority, split_after)``. A lower priority is a better
            split point: 1 = sentence end, 2 = strong clause break,
            3 = comma, 4 = linguistic marker, 99 = not a boundary.
            ``split_after`` is True when the chunk should end with this word
            and False when this word should open the next chunk.
        """
        # Look through trailing closing quotes/brackets so that `te."` still
        # counts as a sentence end; a token made only of closers is kept as is.
        core = word.rstrip(self._CLOSERS) or word

        if core.endswith(self._SENTENCE_END):
            return 1, True
        if core.endswith(self._STRONG_BREAK):
            return 2, True
        if core.endswith(self._WEAK_BREAK):
            return 3, True

        # Linguistic fallback: compare the bare, lower-cased word.
        clean_word = self._NON_WORD.sub('', word.lower())
        if clean_word in self.linguistic_markers:
            # Split BEFORE the conjunction so it begins the next chunk.
            return 4, False

        return self._NO_BREAK, True

    def _choose_split(self, words: List[str], start_idx: int) -> int:
        """Return the index of the last word of the chunk starting at ``start_idx``.

        The returned index is always >= ``start_idx`` so the caller is
        guaranteed to make progress.
        """
        total_words = len(words)
        window_end = min(start_idx + self.max_words, total_words)

        # Best boundary that also leaves a healthy remainder ("valid"), and
        # best boundary regardless of what remains ("any").
        best_valid = None
        best_valid_priority = self._NO_BREAK
        best_any = None
        best_any_priority = self._NO_BREAK

        for idx in range(start_idx, window_end):
            priority, split_after = self._evaluate_word(words[idx])
            if priority >= self._NO_BREAK:
                continue

            candidate = idx if split_after else idx - 1
            if candidate < start_idx:
                # A marker opening the window would yield an empty chunk.
                continue
            if candidate - start_idx + 1 < self.min_words:
                continue

            remaining = total_words - (candidate + 1)
            if remaining == 0 or remaining >= self.min_words:
                if priority < best_valid_priority:
                    best_valid_priority = priority
                    best_valid = candidate
                if priority == 1:
                    # A usable sentence end cannot be beaten; stop scanning.
                    break
            if priority < best_any_priority:
                best_any_priority = priority
                best_any = candidate

        if best_valid is not None:
            split_point = best_valid
        elif best_any is not None:
            split_point = best_any
        else:
            # No usable boundary: hard cut at max_words (or end of text).
            split_point = min(start_idx + self.max_words - 1, total_words - 1)

        # If the cut would leave a too-short tail, move it back so that the
        # tail holds exactly min_words words, provided that stays in the window.
        remaining = total_words - (split_point + 1)
        if 0 < remaining < self.min_words:
            preferred_split = total_words - self.min_words - 1
            if start_idx <= preferred_split <= start_idx + self.max_words - 1:
                split_point = preferred_split

        # Safety net: never return an index that would produce an empty chunk.
        return max(split_point, start_idx)

    def _merge_orphans(self, chunks: List[str]) -> List[str]:
        """Fold chunks shorter than ``min_words`` into their predecessor when it fits."""
        merged = []
        for chunk in chunks:
            chunk_len = len(chunk.split())
            if chunk_len < self.min_words and merged:
                prev_len = len(merged[-1].split())
                if prev_len + chunk_len <= self.max_words:
                    merged[-1] += " " + chunk
                    continue
            merged.append(chunk)
        return merged

    def segment(self, text: str) -> List[str]:
        """Segment ``text`` into a list of chunks.

        Args:
            text: The text to segment. It is tokenised on whitespace, so runs
                of whitespace (including newlines) collapse to single spaces
                in the output.

        Returns:
            The chunks in reading order; an empty list for blank input.
        """
        words = text.split()
        if not words:
            return []

        total_words = len(words)
        chunks = []
        start_idx = 0
        while start_idx < total_words:
            split_point = self._choose_split(words, start_idx)
            chunks.append(" ".join(words[start_idx:split_point + 1]))
            start_idx = split_point + 1

        if self.merge_orphans:
            return self._merge_orphans(chunks)
        return chunks

# ==========================================
# Example Usage on Augustine's Confessions
# ==========================================
# if __name__ == "__main__":
#     # A famous, long, complex passage from Augustine
#     augustine_text = (
#         "Fecisti nos ad te, Domine, et inquietum est cor nostrum donec requiescat in te. "
#         "Quoniam magnus es tu, et laudabilis valde: magna virtus tua, et sapientiae tuae non est numerus. "
#         "Et laudare te vult homo, aliqua portio creaturae tuae, et homo circumferens mortalitatem suam, "
#         "circumferens testimonium peccati sui, et testimonium quia superbis resistis."
#     )
    
#     # Latin conjunctions that naturally start a new clause or thought
#     linguistic_markers = {
#         'et', 'sed', 'quia', 'quoniam', 'autem', 
#         'enim', 'vero', 'nam', 'sicut', 'igitur', 'ergo'
#     }

#     # We set strict limits: minimum 10 words, maximum 18 words per chunk
#     segmenter = ClauseSegmenter(min_words=10, max_words=18, linguistic_markers=linguistic_markers)
    
#     clauses = segmenter.segment(augustine_text)
    
#     print("Segmented Clauses:")
#     for i, clause in enumerate(clauses):
#         word_count = len(clause.split())
#         print(f"[{i + 1}] ({word_count:2d} words): {clause}")