Arabic-Rag-Chatbot / document_processor.py
Ahmed-Alghamdi's picture
Update document_processor.py
fb7084d verified
raw
history blame
4.83 kB
# document_processor.py
# Loads .txt documents, cleans their text, and splits them into overlapping
# character-based chunks suitable for retrieval (RAG) indexing.
import os
import glob
from tqdm import tqdm
import pandas as pd
# Project-local helpers: clean_text normalizes raw file text; setup_logger
# builds a named logger — presumably configured in utils (TODO confirm).
from utils import clean_text, setup_logger
# Module-level logger shared by all functions in this file.
logger = setup_logger('document_processor')
def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Number of characters per chunk.
        overlap: Number of characters to overlap between consecutive chunks.

    Returns:
        list[str]: Non-empty, stripped chunks covering the input text.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        # Try to break at a sentence boundary for better context
        if end < text_length:
            # Latest sentence-ending mark inside the chunk ('؟' is the
            # Arabic question mark); -1 from every rfind means none found.
            break_point = max(
                chunk.rfind('.'),
                chunk.rfind('؟'),
                chunk.rfind('!'),
                chunk.rfind('\n'),
            )
            # Only break if we're past halfway through the chunk, so the
            # shortened chunk doesn't become too small.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1
        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        # Move start back by `overlap`, but guarantee forward progress:
        # with pathological parameters (overlap close to chunk_size plus an
        # early sentence break), `end - overlap` could equal the previous
        # start (infinite loop) or even go negative (wrong slice).
        next_start = end - overlap
        start = next_start if next_start > start else start + 1
    return chunks
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt document in a folder and split each into chunks.

    Args:
        folder_path: Path to folder containing .txt files.
        chunk_size: Size of each chunk in characters (default: 400).
        overlap: Overlap between chunks in characters (default: 75).

    Returns:
        pandas.DataFrame with one row per chunk (path, chunk_id,
        total_chunks, content, content_length); empty if no files found.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    records = []
    loaded_files = 0
    for file_path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                cleaned = clean_text(handle.read())
            if not cleaned:
                logger.warning(f"Empty content in {file_path}")
                continue
            # One record per chunk, tagged with its position in the file.
            pieces = split_into_chunks(cleaned, chunk_size, overlap)
            records.extend(
                {
                    'path': file_path,
                    'chunk_id': idx,
                    'total_chunks': len(pieces),
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(pieces)
            )
            loaded_files += 1
            logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")
        except Exception as e:
            # Best-effort: a bad file is logged and skipped, not fatal.
            logger.error(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(records)
    if not df.empty:
        logger.info(f"Total: {loaded_files} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
    return df
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single .txt document and split it into chunks.

    Args:
        file_path: Path to the .txt file.
        chunk_size: Size of each chunk in characters.
        overlap: Overlap between chunks in characters.

    Returns:
        pandas.DataFrame with one row per chunk; empty on read error or
        empty content.
    """
    # The broad try mirrors the batch loader: any failure (read, clean,
    # chunking) is logged and an empty frame is returned.
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            cleaned = clean_text(handle.read())
        if not cleaned:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()
        pieces = split_into_chunks(cleaned, chunk_size, overlap)
        rows = [
            {
                'path': file_path,
                'chunk_id': idx,
                'total_chunks': len(pieces),
                'content': piece,
                'content_length': len(piece),
            }
            for idx, piece in enumerate(pieces)
        ]
        logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")
        return pd.DataFrame(rows)
    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()