Spaces:

bilalRHCH
/

arabic-tts-api

Sleeping

File size: 2,293 Bytes

5b28f25

import re

def chunk_text(text: str, max_chunk_length: int = 300) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_length* characters.

    Boundaries are chosen from coarsest to finest: paragraph breaks (newlines),
    sentence terminators (``.``, ``!``, ``?``, Arabic ``؟``), commas (``,``,
    Arabic ``،``) and finally plain whitespace. Words are never cut in the
    middle, so a single word longer than *max_chunk_length* is returned as one
    oversized chunk.

    Args:
        text: The text to split. Empty or whitespace-only text yields ``[]``.
        max_chunk_length: Target upper bound for the length of each chunk.

    Returns:
        The chunks in their original order, each stripped of leading and
        trailing whitespace. Pieces that were re-joined inside a chunk are
        separated by a single space.
    """
    if not text:
        return []

    sentence_boundary = r'(?<=[.!?؟])\s+'
    comma_boundary = r'(?<=[,،])\s+'
    word_boundary = r'\s+'

    def _split_respecting_length(
        text_part: str, delimiter_pattern: str, sep: str = " "
    ) -> list[str]:
        """Split on the pattern, then greedily re-pack pieces up to the limit."""
        packed: list[str] = []
        current = ""
        for piece in re.split(delimiter_pattern, text_part):
            piece = piece.strip()
            if not piece:
                continue
            if not current:
                # A piece that opens a chunk is accepted even if it is too
                # long by itself; the caller re-splits it at a finer level.
                current = piece
            elif len(current) + len(sep) + len(piece) <= max_chunk_length:
                current = f"{current}{sep}{piece}"
            else:
                packed.append(current)
                current = piece
        if current:
            packed.append(current)
        return packed

    chunks: list[str] = []
    for raw_paragraph in re.split(r'\n+', text):
        # Strip first so short paragraphs come out as clean as split ones
        # (no stray spaces, no '\r' left over from Windows line endings).
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_chunk_length:
            chunks.append(paragraph)
            continue

        for sentence in _split_respecting_length(paragraph, sentence_boundary):
            if len(sentence) <= max_chunk_length:
                chunks.append(sentence)
                continue

            for clause in _split_respecting_length(sentence, comma_boundary):
                if len(clause) <= max_chunk_length:
                    chunks.append(clause)
                else:
                    # Last resort: break on whitespace between words.
                    chunks.extend(_split_respecting_length(clause, word_boundary))

    return chunks

# Manual smoke test: only runs when this file is executed as a script.
if __name__ == "__main__":
    # Repeat a short Arabic passage so the sample is long enough to be split.
    sample = "مرحباً بكم. هذا هو النص الأول! وهذا هو النص الثاني، الذي سنقوم بتقسيمه. " * 10
    print(f"Original text length: {len(sample)}")
    pieces = chunk_text(sample, max_chunk_length=100)
    # Number the chunks from 1 for readability.
    for number, piece in enumerate(pieces, start=1):
        print(f"Chunk {number} (len={len(piece)}): {piece}")