Spaces:

AI-Driven-Data-Driven
/

Arabic-Rag-Chatbot

Sleeping

App Files Files Community

Ahmed-Alghamdi committed on Dec 30, 2025

Commit

c19463a

verified ·

1 Parent(s): 5b8e218

Update document_processor.py

Browse files

Files changed (1) hide show

document_processor.py +16 -19

document_processor.py CHANGED Viewed

@@ -1,20 +1,18 @@
-# document_processor.py
 import os
-import glob
-from tqdm import tqdm
 import pandas as pd
 from utils import clean_text, setup_logger
 logger = setup_logger('document_processor')
-def split_into_chunks(text, chunk_size=400, overlap=75):
     """
     Split text into overlapping chunks
     Args:
         text: The text to split
-        chunk_size: Number of characters per chunk
-        overlap: Number of characters to overlap between chunks
     """
     chunks = []
     start = 0
@@ -36,6 +34,7 @@ def split_into_chunks(text, chunk_size=400, overlap=75):
             break_point = max(last_period, last_question, last_exclamation, last_newline)
             # Only break if we're past halfway through the chunk
             if break_point > chunk_size * 0.5:
                 chunk = chunk[:break_point + 1]
                 end = start + break_point + 1
@@ -44,25 +43,23 @@ def split_into_chunks(text, chunk_size=400, overlap=75):
         if chunk:  # Only add non-empty chunks
             chunks.append(chunk)
-        start = end - overlap  # Move start with overlap
     return chunks
-    if not df.empty:
-        logger.info(f"Total: {file_count} files → {len(df)} chunks")
-        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
-    return df
-def load_single_document(file_path, chunk_size=400, overlap=75):
     """
     Load a single document and split it into chunks
     Args:
         file_path: Path to the .txt file
-        chunk_size: Size of each chunk in characters
-        overlap: Overlap between chunks in characters
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
@@ -72,7 +69,7 @@ def load_single_document(file_path, chunk_size=400, overlap=75):
                 logger.warning(f"Empty content in {file_path}")
                 return pd.DataFrame()
-            # Split into chunks
             chunks = split_into_chunks(content, chunk_size, overlap)
             # Create dataframe with chunks

 import os
 import pandas as pd
 from utils import clean_text, setup_logger
 logger = setup_logger('document_processor')
+# تم تعديل القيم الافتراضية هنا لتناسب النصوص الطويلة
+def split_into_chunks(text, chunk_size=1000, overlap=200):
     """
     Split text into overlapping chunks
     Args:
         text: The text to split
+        chunk_size: Number of characters per chunk (increased to 1000)
+        overlap: Number of characters to overlap between chunks (increased to 200)
     """
     chunks = []
     start = 0
             break_point = max(last_period, last_question, last_exclamation, last_newline)
             # Only break if we're past halfway through the chunk
+            # This ensures we don't create very small chunks
             if break_point > chunk_size * 0.5:
                 chunk = chunk[:break_point + 1]
                 end = start + break_point + 1
         if chunk:  # Only add non-empty chunks
             chunks.append(chunk)
+        # Move the start pointer back by `overlap` so consecutive chunks overlap.
+        # If that would not advance past the current start, jump to `end` instead to avoid an infinite loop.
+        if start >= end - overlap:
+            start = end
+        else:
+            start = end - overlap
     return chunks
+def load_single_document(file_path, chunk_size=1000, overlap=200):
     """
     Load a single document and split it into chunks
     Args:
         file_path: Path to the .txt file
+        chunk_size: Size of each chunk in characters (Default: 1000)
+        overlap: Overlap between chunks in characters (Default: 200)
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
                 logger.warning(f"Empty content in {file_path}")
                 return pd.DataFrame()
+            # Split into chunks using the new sizes
             chunks = split_into_chunks(content, chunk_size, overlap)
             # Create dataframe with chunks