| |
| import os |
| import glob |
| from tqdm import tqdm |
| import pandas as pd |
| from utils import clean_text, setup_logger |
|
|
# Module-level logger for this file; configured by the project's setup_logger helper.
logger = setup_logger('document_processor')
|
|
| def split_into_chunks(text, chunk_size=400, overlap=75): |
| """ |
| Split text into overlapping chunks |
| |
| Args: |
| text: The text to split |
| chunk_size: Number of characters per chunk |
| overlap: Number of characters to overlap between chunks |
| """ |
| chunks = [] |
| start = 0 |
| text_length = len(text) |
| |
| while start < text_length: |
| end = start + chunk_size |
| chunk = text[start:end] |
| |
| |
| if end < text_length: |
| |
| last_period = chunk.rfind('.') |
| last_question = chunk.rfind('؟') |
| last_exclamation = chunk.rfind('!') |
| last_newline = chunk.rfind('\n') |
| |
| |
| break_point = max(last_period, last_question, last_exclamation, last_newline) |
| |
| |
| if break_point > chunk_size * 0.5: |
| chunk = chunk[:break_point + 1] |
| end = start + break_point + 1 |
| |
| chunk = chunk.strip() |
| if chunk: |
| chunks.append(chunk) |
| |
| start = end - overlap |
| |
| return chunks |
|
|
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt document in a folder and split each one into chunks.

    Args:
        folder_path: Path to folder containing .txt files
        chunk_size: Size of each chunk in characters (default: 400)
        overlap: Overlap between chunks in characters (default: 75)

    Returns:
        pandas.DataFrame with one row per chunk (columns: path, chunk_id,
        total_chunks, content, content_length); empty if nothing was loaded.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    loaded_files = 0

    for path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                text = clean_text(fh.read())

            if not text:
                logger.warning(f"Empty content in {path}")
                continue

            pieces = split_into_chunks(text, chunk_size, overlap)

            # One row per chunk, tagged with its position within the file.
            rows.extend(
                {
                    'path': path,
                    'chunk_id': idx,
                    'total_chunks': len(pieces),
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(pieces)
            )

            loaded_files += 1
            logger.info(f"Loaded {os.path.basename(path)}: {len(pieces)} chunks")

        except Exception as exc:
            logger.error(f"Error reading {path}: {exc}")

    df = pd.DataFrame(rows)

    if not df.empty:
        logger.info(f"Total: {loaded_files} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")

    return df
|
|
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single .txt document and split it into chunks.

    Args:
        file_path: Path to the .txt file
        chunk_size: Size of each chunk in characters
        overlap: Overlap between chunks in characters

    Returns:
        pandas.DataFrame with one row per chunk (columns: path, chunk_id,
        total_chunks, content, content_length); empty on error or empty file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = clean_text(fh.read())

        if not text:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()

        pieces = split_into_chunks(text, chunk_size, overlap)
        total = len(pieces)

        # Build one record per chunk, keeping its position within the file.
        rows = [
            {
                'path': file_path,
                'chunk_id': idx,
                'total_chunks': total,
                'content': piece,
                'content_length': len(piece),
            }
            for idx, piece in enumerate(pieces)
        ]

        logger.info(f"Loaded {os.path.basename(file_path)}: {total} chunks")
        return pd.DataFrame(rows)

    except Exception as exc:
        logger.error(f"Error reading {file_path}: {exc}")
        return pd.DataFrame()