# document_processor.py
import os
import glob
from tqdm import tqdm
import pandas as pd
from utils import clean_text, setup_logger
logger = setup_logger('document_processor')
def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Number of characters per chunk.
        overlap: Number of characters to overlap between chunks.

    Returns:
        list[str]: Non-empty, stripped chunks covering the whole text.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        # Try to break at a sentence boundary for better context,
        # but only when this is not the final chunk.
        if end < text_length:
            # Candidate break points: sentence enders (incl. the Arabic
            # question mark) or a newline; pick the right-most one.
            break_point = max(
                chunk.rfind('.'),
                chunk.rfind('؟'),  # Arabic question mark
                chunk.rfind('!'),
                chunk.rfind('\n'),
            )
            # Only break if we're past halfway through the chunk, so
            # chunks don't become degenerately short.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1
        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        # Advance with overlap. The max() guarantees forward progress:
        # the original `start = end - overlap` could loop forever when
        # overlap >= chunk_size, or when a sentence break shortened `end`
        # to within `overlap` characters of `start`.
        start = max(end - overlap, start + 1)
    return chunks
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt file in a folder and split each one into chunks.

    Args:
        folder_path: Path to folder containing .txt files.
        chunk_size: Size of each chunk in characters (default: 400).
        overlap: Overlap between chunks in characters (default: 75).

    Returns:
        pandas.DataFrame with one row per chunk (path, chunk_id,
        total_chunks, content, content_length); empty if nothing loaded.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    loaded_files = 0
    for path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                text = clean_text(handle.read())
            if not text:
                logger.warning(f"Empty content in {path}")
                continue
            # One row per chunk, all sharing the file's path and chunk total.
            pieces = split_into_chunks(text, chunk_size, overlap)
            total = len(pieces)
            rows.extend(
                {
                    'path': path,
                    'chunk_id': index,
                    'total_chunks': total,
                    'content': piece,
                    'content_length': len(piece),
                }
                for index, piece in enumerate(pieces)
            )
            loaded_files += 1
            logger.info(f"Loaded {os.path.basename(path)}: {total} chunks")
        except Exception as e:
            logger.error(f"Error reading {path}: {e}")

    df = pd.DataFrame(rows)
    if not df.empty:
        logger.info(f"Total: {loaded_files} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
    return df
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load one .txt document and split it into chunks.

    Args:
        file_path: Path to the .txt file.
        chunk_size: Size of each chunk in characters.
        overlap: Overlap between chunks in characters.

    Returns:
        pandas.DataFrame with one row per chunk (path, chunk_id,
        total_chunks, content, content_length); empty on error or
        empty content.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = clean_text(handle.read())
        if not text:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()
        # Build one record per chunk; every record carries the chunk total.
        pieces = split_into_chunks(text, chunk_size, overlap)
        records = [
            {
                'path': file_path,
                'chunk_id': index,
                'total_chunks': len(pieces),
                'content': piece,
                'content_length': len(piece),
            }
            for index, piece in enumerate(pieces)
        ]
        logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")
        return pd.DataFrame(records)
    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()