# document_processor.py
"""Load .txt documents and split them into overlapping character chunks."""
import os
import glob

from tqdm import tqdm
import pandas as pd

from utils import clean_text, setup_logger

logger = setup_logger('document_processor')


def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split
        chunk_size: Number of characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of non-empty, stripped chunk strings covering the text.

    Raises:
        ValueError: If chunk_size is not positive, or overlap >= chunk_size
            (the window would never advance and the loop would never end).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence boundary for better context,
        # but only when this is not already the final chunk.
        if end < text_length:
            # Look for sentence endings
            last_period = chunk.rfind('.')
            last_question = chunk.rfind('؟')  # Arabic question mark
            last_exclamation = chunk.rfind('!')
            last_newline = chunk.rfind('\n')

            # Find the best (right-most) break point
            break_point = max(last_period, last_question,
                              last_exclamation, last_newline)

            # Only break if we're past halfway through the chunk, so
            # chunks don't shrink to tiny fragments.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        # FIX: the original advanced `start = end - overlap` unconditionally;
        # once `end` reached past the end of the text that could re-enter the
        # loop and emit a duplicate chunk made only of the final `overlap`
        # characters. Stop as soon as the text is fully covered.
        if end >= text_length:
            break
        start = end - overlap  # Move start back by `overlap` for context

    return chunks


def _chunk_records(file_path, chunks):
    """Build one record dict per chunk (shared by both loaders)."""
    return [
        {
            'path': file_path,
            'chunk_id': i,
            'total_chunks': len(chunks),
            'content': chunk,
            'content_length': len(chunk),
        }
        for i, chunk in enumerate(chunks)
    ]


def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load all .txt documents from folder and split them into chunks.

    Args:
        folder_path: Path to folder containing .txt files
        chunk_size: Size of each chunk in characters (default: 400)
        overlap: Overlap between chunks in characters (default: 75)

    Returns:
        DataFrame with columns path, chunk_id, total_chunks, content,
        content_length — empty if no .txt files are found or none load.
    """
    documents = []
    file_count = 0

    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    for file_path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = clean_text(file.read())

            if not content:
                logger.warning(f"Empty content in {file_path}")
                continue

            # Split into chunks and create a document entry for each chunk
            chunks = split_into_chunks(content, chunk_size, overlap)
            documents.extend(_chunk_records(file_path, chunks))

            file_count += 1
            logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")
        except Exception as e:
            # Best-effort: a single unreadable file must not abort the batch.
            logger.error(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(documents)
    if not df.empty:
        logger.info(f"Total: {file_count} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
    return df


def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single document and split it into chunks.

    Args:
        file_path: Path to the .txt file
        chunk_size: Size of each chunk in characters
        overlap: Overlap between chunks in characters

    Returns:
        DataFrame with one row per chunk (same schema as load_documents);
        empty DataFrame if the file is empty or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = clean_text(file.read())

        if not content:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()

        # Split into chunks and build the per-chunk dataframe
        chunks = split_into_chunks(content, chunk_size, overlap)
        logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")
        return pd.DataFrame(_chunk_records(file_path, chunks))
    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()