Spaces:
Sleeping
Sleeping
Update preprocessing.py
Browse files- preprocessing.py +45 -0
preprocessing.py
CHANGED
|
@@ -5,6 +5,51 @@ import re
|
|
| 5 |
from hazm import Normalizer
|
| 6 |
import pypdf
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def is_meaningful(text):
|
| 10 |
"""
|
|
|
|
| 5 |
from hazm import Normalizer
|
| 6 |
import pypdf
|
| 7 |
|
| 8 |
+
from nltk.tokenize import sent_tokenize
|
| 9 |
+
from hazm import SentenceTokenizer # For Persian sentence tokenization
|
| 10 |
+
|
| 11 |
+
def smart_chunking(text, max_tokens=1024, tokenizer=None):
    """
    Split *text* into meaningful chunks along sentence boundaries.

    Each chunk is kept at or under ``max_tokens`` tokens where possible.
    A single sentence longer than the limit still becomes its own chunk,
    because sentences are never split mid-way. Supports both Persian
    (via hazm) and English (via NLTK) text.

    Parameters
    ----------
    text : str
        The input text to chunk.
    max_tokens : int, optional
        Upper bound on the token count per chunk (default 1024).
    tokenizer : optional
        Object exposing ``encode(str) -> sequence`` (e.g. a HuggingFace
        tokenizer) used to count tokens. When ``None``, token count is
        approximated by whitespace splitting.

    Returns
    -------
    list[str]
        The chunks, in original order; empty list for empty input.
    """
    # Step 1: choose a sentence tokenizer by detecting Persian characters.
    if any(lang_char in text for lang_char in "ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی"):
        # Use hazm for Persian sentence tokenization.
        persian_sent_tokenizer = SentenceTokenizer()
        sentences = persian_sent_tokenizer.tokenize(text)
    else:
        # Use NLTK for English sentence tokenization.
        sentences = sent_tokenize(text)

    # Step 2: initialize accumulators.
    chunks = []
    current_chunk = []
    current_length = 0

    # Step 3: greedily pack sentences into chunks.
    for sentence in sentences:
        # Estimate the sentence length in tokens.
        sentence_tokens = tokenizer.encode(sentence) if tokenizer else sentence.split()
        sentence_length = len(sentence_tokens)

        # Flush the current chunk before it would overflow.
        # BUG FIX: flush only when the chunk is non-empty — previously an
        # over-long sentence arriving on an empty chunk appended a spurious
        # "" chunk via " ".join([]).
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Add the sentence to the current chunk.
        current_chunk.append(sentence)
        current_length += sentence_length

    # Add any remaining sentences as the last chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
| 52 |
+
|
| 53 |
|
| 54 |
def is_meaningful(text):
|
| 55 |
"""
|