Spaces:
Sleeping
Sleeping
Update preprocessing.py
Browse files- preprocessing.py +45 -0
preprocessing.py
CHANGED
|
@@ -5,6 +5,51 @@ import re
|
|
| 5 |
from hazm import Normalizer
|
| 6 |
import pypdf
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def is_meaningful(text):
|
| 10 |
"""
|
|
|
|
| 5 |
from hazm import Normalizer
|
| 6 |
import pypdf
|
| 7 |
|
| 8 |
+
from nltk.tokenize import sent_tokenize
|
| 9 |
+
from hazm import SentenceTokenizer # For Persian sentence tokenization
|
| 10 |
+
|
| 11 |
+
def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Split *text* into meaningful chunks along sentence boundaries.

    Each chunk is kept at or under ``max_tokens`` tokens where possible.
    A single sentence longer than the limit still becomes its own chunk,
    because sentences are never split mid-way. Supports both Persian
    (via hazm) and English (via NLTK) text.

    Parameters
    ----------
    text : str
        The input text to chunk.
    max_tokens : int, optional
        Upper bound on the token count per chunk (default 1024).
    tokenizer : optional
        Object exposing ``encode(str) -> sequence`` (e.g. a HuggingFace
        tokenizer) used to count tokens. When ``None``, token count is
        approximated by whitespace splitting.

    Returns
    -------
    list[str]
        The chunks, in original order; empty list for empty input.
    """
    # Step 1: choose a sentence tokenizer by detecting Persian characters.
    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):
        # Use hazm for Persian sentence tokenization.
        persian_sent_tokenizer = SentenceTokenizer()
        sentences = persian_sent_tokenizer.tokenize(text)
    else:
        # Use NLTK for English sentence tokenization.
        sentences = sent_tokenize(text)

    # Step 2: initialize accumulators.
    chunks = []
    current_chunk = []
    current_length = 0

    # Step 3: greedily pack sentences into chunks.
    for sentence in sentences:
        # Estimate the sentence length in tokens.
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)

        # Flush the current chunk before it would overflow.
        # BUG FIX: flush only when the chunk is non-empty — previously an
        # over-long sentence arriving on an empty chunk appended a spurious
        # "" chunk via " ".join([]).
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Add the sentence to the current chunk.
        current_chunk.append(sentence)
        current_length += sentence_length

    # Add any remaining sentences as the last chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
| 52 |
+
|
| 53 |
|
| 54 |
def is_meaningful(text):
|
| 55 |
"""
|