import re from typing import List class PersianSentenceSplitter: def __init__(self, max_chars: int = 200, min_chars: int = 50): self.max_chars = max_chars self.min_chars = min_chars self.sentence_endings = r'[.!?؟۔]' self.weak_boundaries = r'[،,;؛]' def clean_text(self, text: str) -> str: text = re.sub(r'\s+', ' ', text) text = text.replace('_', '\u200c') text = text.replace('ك', 'ک').replace('ي', 'ی') persian_digits = '۰۱۲۳۴۵۶۷۸۹' english_digits = '0123456789' digit_map = str.maketrans(persian_digits, english_digits) text = text.translate(digit_map) arabic_digits = '٠١٢٣٤٥٦٧٨٩' arabic_map = str.maketrans(arabic_digits, english_digits) text = text.translate(arabic_map) return text.strip() def split_by_punctuation(self, text: str) -> List[str]: segments = re.split(f'({self.sentence_endings})', text) sentences = [] for i in range(0, len(segments) - 1, 2): if i + 1 < len(segments): sentence = segments[i] + segments[i + 1] else: sentence = segments[i] sentence = sentence.strip() if sentence: sentences.append(sentence) if len(segments) % 2 == 1 and segments[-1].strip(): sentences.append(segments[-1].strip()) return sentences def split_long_sentence(self, sentence: str) -> List[str]: if len(sentence) <= self.max_chars: return [sentence] chunks = [] current_chunk = "" parts = re.split(f'({self.weak_boundaries})', sentence) for i in range(0, len(parts)): part = parts[i] if len(current_chunk + part) > self.max_chars and current_chunk: chunks.append(current_chunk.strip()) current_chunk = part else: current_chunk += part if current_chunk.strip(): chunks.append(current_chunk.strip()) final_chunks = [] for chunk in chunks: if len(chunk) > self.max_chars: final_chunks.extend(self.force_split_by_words(chunk)) else: final_chunks.append(chunk) return final_chunks def force_split_by_words(self, text: str) -> List[str]: words = text.split() chunks = [] current_chunk = [] current_length = 0 for word in words: word_length = len(word) + 1 # +1 for space if current_length + word_length > self.max_chars and current_chunk: chunks.append(' '.join(current_chunk)) current_chunk = [word] current_length = word_length else: current_chunk.append(word) current_length += word_length if current_chunk: chunks.append(' '.join(current_chunk)) return chunks def split(self, text: str) -> List[str]: text = self.clean_text(text) if not text: return [] if len(text) <= self.max_chars: return [text] sentences = self.split_by_punctuation(text) final_segments = [] for sentence in sentences: if len(sentence) > self.max_chars: final_segments.extend(self.split_long_sentence(sentence)) else: final_segments.append(sentence) final_segments = [seg.strip() for seg in final_segments if seg.strip()] return final_segments