Update document_processor.py
document_processor.py (+127 -3)
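This commit replaces the old single-pass loader with sentence-aware, overlapping chunking: a new `split_into_chunks` helper, a folder-level `load_documents` that emits one row per chunk, and a single-file `load_single_document` variant.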
Removed (old lines 7-17; the bodies of two deleted lines are elided):

```diff
@@ -7,11 +7,135 @@ from utils import clean_text, setup_logger
 
 logger = setup_logger('document_processor')
 
-def …
     try:
         with open(file_path, 'r', encoding='utf-8') as file:
             content = clean_text(file.read())
-        …
     except Exception as e:
         logger.error(f"Error reading {file_path}: {e}")
-        return pd.DataFrame()
```
The updated code (new lines 7-141). First, the sentence-aware chunking helper:

```python
logger = setup_logger('document_processor')

def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks

    Args:
        text: The text to split
        chunk_size: Number of characters per chunk
        overlap: Number of characters to overlap between chunks
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence boundary for better context
        if end < text_length:
            # Look for sentence endings
            last_period = chunk.rfind('.')
            last_question = chunk.rfind('؟')  # Arabic question mark
            last_exclamation = chunk.rfind('!')
            last_newline = chunk.rfind('\n')

            # Find the best break point
            break_point = max(last_period, last_question, last_exclamation, last_newline)

            # Only break if we're past halfway through the chunk
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        start = end - overlap  # Move start forward, keeping `overlap` characters of context

    return chunks
```
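A quick sketch of how the helper behaves; this is hypothetical and not part of the commit, and the sample string and parameters are made up:

```python
# Hypothetical usage sketch, assuming document_processor.py is importable.
from document_processor import split_into_chunks

sample = "First sentence. Second one! A question؟ " * 30  # ~1,200 characters
chunks = split_into_chunks(sample, chunk_size=400, overlap=75)

# Consecutive chunks share up to `overlap` characters of trailing context.
for i, c in enumerate(chunks):
    print(i, len(c), repr(c[:30]))
```

One property worth noting: because a mid-text cut is only accepted past `chunk_size * 0.5`, each iteration moves `start` forward by more than `chunk_size * 0.5 - overlap` characters, so the loop terminates whenever `overlap < chunk_size / 2` (at least ~125 characters per step with the defaults).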
Then `load_documents`, which loads every `.txt` file in a folder and emits one row per chunk:

```python
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load all .txt documents from folder and split them into chunks

    Args:
        folder_path: Path to folder containing .txt files
        chunk_size: Size of each chunk in characters (default: 400)
        overlap: Overlap between chunks in characters (default: 75)
    """
    documents = []
    file_count = 0

    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))

    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    for file_path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = clean_text(file.read())

            if not content:
                logger.warning(f"Empty content in {file_path}")
                continue

            # Split into chunks
            chunks = split_into_chunks(content, chunk_size, overlap)

            # Create a document entry for each chunk
            for i, chunk in enumerate(chunks):
                documents.append({
                    'path': file_path,
                    'chunk_id': i,
                    'total_chunks': len(chunks),
                    'content': chunk,
                    'content_length': len(chunk)
                })

            file_count += 1
            logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")

        except Exception as e:
            logger.error(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(documents)

    if not df.empty:
        logger.info(f"Total: {file_count} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")

    return df
```
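A hypothetical folder-level call (the `corpus/` path is made up). This assumes the module's own dependencies (`glob`, `os`, `pandas`, `tqdm`) are imported above the hunk, which the code requires but the diff does not show:

```python
# Hypothetical usage sketch; 'corpus/' is a made-up folder of UTF-8 .txt files.
from document_processor import load_documents

df = load_documents('corpus/', chunk_size=400, overlap=75)
if not df.empty:
    # One row per chunk: path, chunk_id, total_chunks, content, content_length
    print(df[['path', 'chunk_id', 'content_length']].head())
```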
And the single-file counterpart:

```python
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single document and split it into chunks

    Args:
        file_path: Path to the .txt file
        chunk_size: Size of each chunk in characters
        overlap: Overlap between chunks in characters
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = clean_text(file.read())

        if not content:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()

        # Split into chunks
        chunks = split_into_chunks(content, chunk_size, overlap)

        # Create dataframe with chunks
        documents = []
        for i, chunk in enumerate(chunks):
            documents.append({
                'path': file_path,
                'chunk_id': i,
                'total_chunks': len(chunks),
                'content': chunk,
                'content_length': len(chunk)
            })

        logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")
        return pd.DataFrame(documents)

    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()
```
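Finally, a sketch of the single-file variant, again with a made-up path. Both loaders return an empty DataFrame on failure rather than raising, so callers branch on `df.empty`:

```python
# Hypothetical usage sketch; 'docs/sample.txt' is a made-up path.
from document_processor import load_single_document

df = load_single_document('docs/sample.txt')
if df.empty:
    print("nothing loaded")  # missing file, empty content, or read error
else:
    print(f"{df['total_chunks'].iloc[0]} chunks from {df['path'].iloc[0]}")
```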