File size: 3,634 Bytes
da2ee9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import re
from typing import List
class PersianSentenceSplitter:
    """Normalise Persian text and cut it into chunks of bounded length.

    The pipeline implemented by :meth:`split` is:

    1. :meth:`clean_text` - whitespace and character normalisation.
    2. :meth:`split_by_punctuation` - cut at sentence terminators.
    3. :meth:`split_long_sentence` - cut over-long sentences at weak
       boundaries (commas / semicolons).
    4. :meth:`force_split_by_words` - last resort, cut at whitespace.

    Attributes:
        max_chars: Target upper bound on the length of each returned chunk.
        min_chars: Stored on the instance but not consulted by any method
            of this class.
        sentence_endings: Regex matching ONE sentence-terminating character.
        weak_boundaries: Regex matching ONE clause-level delimiter.
    """

    # Single-pass normalisation table used by clean_text().  Built once at
    # class creation instead of on every call.
    _NORMALISATION_TABLE = str.maketrans(
        {
            '_': '\u200c',       # underscore -> ZERO WIDTH NON-JOINER
            '\u0643': '\u06a9',  # ARABIC KAF -> PERSIAN KEHEH
            '\u064a': '\u06cc',  # ARABIC YEH -> FARSI YEH
            # Extended Arabic-Indic (Persian) digits -> ASCII digits
            **{chr(0x06F0 + d): str(d) for d in range(10)},
            # Arabic-Indic digits -> ASCII digits
            **{chr(0x0660 + d): str(d) for d in range(10)},
        }
    )

    def __init__(self, max_chars: int = 200, min_chars: int = 50):
        """Store the length limits and the boundary patterns.

        Args:
            max_chars: Maximum desired chunk length, in characters.
            min_chars: Kept for callers that read it; not used internally.
        """
        self.max_chars = max_chars
        self.min_chars = min_chars
        self.sentence_endings = r'[.!?؟۔]'
        self.weak_boundaries = r'[،,;؛]'

    def clean_text(self, text: str) -> str:
        """Return *text* with whitespace collapsed and characters normalised.

        Every run of whitespace becomes one space; ``_`` becomes U+200C;
        Arabic kaf/yeh become their Persian forms; Persian and Arabic-Indic
        digits become ASCII digits; leading/trailing whitespace is removed.
        """
        collapsed = re.sub(r'\s+', ' ', text)
        # None of the replacement characters is itself a key of the table,
        # so one translate() pass equals the original chained replacements.
        return collapsed.translate(self._NORMALISATION_TABLE).strip()

    def split_by_punctuation(self, text: str) -> List[str]:
        """Split *text* into sentences, keeping terminators attached.

        A run of consecutive terminators (``...``, ``?!``) is treated as a
        single terminator, so it never produces punctuation-only sentences
        in the middle of the text.

        Returns:
            Stripped, non-empty sentences in their original order.
        """
        # With one capture group re.split() yields
        # [body, terminators, body, terminators, ..., trailing_body].
        pieces = re.split(f'((?:{self.sentence_endings})+)', text)
        sentences: List[str] = []
        for body, terminators in zip(pieces[0::2], pieces[1::2]):
            if not body.strip() and sentences:
                # Terminators separated from the previous run only by
                # whitespace: keep them with the sentence they follow.
                sentences[-1] += terminators
                continue
            sentence = (body + terminators).strip()
            if sentence:
                sentences.append(sentence)
        trailing = pieces[-1].strip()
        if trailing:
            sentences.append(trailing)
        return sentences

    def split_long_sentence(self, sentence: str) -> List[str]:
        """Cut *sentence* at weak boundaries so chunks fit in ``max_chars``.

        Clauses are packed greedily; each delimiter stays attached to the
        clause it ends, so no chunk starts with a delimiter that belonged
        to the previous clause.  Chunks that are still too long are handed
        to :meth:`force_split_by_words`.

        Returns:
            ``[sentence]`` unchanged when it already fits, otherwise the
            list of stripped, non-empty chunks.
        """
        if len(sentence) <= self.max_chars:
            return [sentence]

        parts = re.split(f'({self.weak_boundaries})', sentence)
        # Glue every delimiter onto the clause in front of it; the final
        # element of `parts` is the clause after the last delimiter.
        clauses = [
            clause + delimiter
            for clause, delimiter in zip(parts[0::2], parts[1::2])
        ]
        clauses.append(parts[-1])

        packed: List[str] = []
        current = ''
        for clause in clauses:
            # Measure what would actually be emitted (after stripping).
            if current and len((current + clause).strip()) > self.max_chars:
                packed.append(current)
                current = clause
            else:
                current += clause
        packed.append(current)

        result: List[str] = []
        for raw in packed:
            chunk = raw.strip()
            if not chunk:
                continue
            if len(chunk) > self.max_chars:
                result.extend(self.force_split_by_words(chunk))
            else:
                result.append(chunk)
        return result

    def force_split_by_words(self, text: str) -> List[str]:
        """Cut *text* at whitespace into chunks of at most ``max_chars``.

        Words are never broken: a single word longer than ``max_chars`` is
        returned as a chunk of its own and exceeds the limit.

        Returns:
            Chunks whose words are joined by single spaces.
        """
        chunks: List[str] = []
        current: List[str] = []
        current_len = 0  # always equals len(' '.join(current))
        for word in text.split():
            # A separating space is only needed when the chunk already
            # holds a word; counting it unconditionally split too early.
            added = len(word) + (1 if current else 0)
            if current and current_len + added > self.max_chars:
                chunks.append(' '.join(current))
                current = [word]
                current_len = len(word)
            else:
                current.append(word)
                current_len += added
        if current:
            chunks.append(' '.join(current))
        return chunks

    def split(self, text: str) -> List[str]:
        """Clean *text* and return it as a list of chunks.

        Text that is empty after cleaning yields ``[]``; text that already
        fits in ``max_chars`` is returned as a single chunk without being
        split into sentences.
        """
        text = self.clean_text(text)
        if not text:
            return []
        if len(text) <= self.max_chars:
            return [text]

        segments: List[str] = []
        for sentence in self.split_by_punctuation(text):
            # split_long_sentence() returns short sentences unchanged.
            segments.extend(self.split_long_sentence(sentence))
        return [seg.strip() for seg in segments if seg.strip()]
|