# document_processor.py
import os
import glob
from tqdm import tqdm
import pandas as pd
from utils import clean_text, setup_logger
logger = setup_logger('document_processor')
def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Number of characters per chunk.
        overlap: Number of characters to overlap between chunks.

    Returns:
        list[str]: Non-empty, stripped chunks covering the whole text.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        # Try to break at a sentence boundary for better context,
        # but only when this is not the final chunk.
        if end < text_length:
            # Candidate break points: sentence enders (incl. the Arabic
            # question mark) or a newline; pick the right-most one.
            break_point = max(
                chunk.rfind('.'),
                chunk.rfind('؟'),  # Arabic question mark
                chunk.rfind('!'),
                chunk.rfind('\n'),
            )
            # Only break if we're past halfway through the chunk, so
            # chunks don't become degenerately short.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1
        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        # Advance with overlap. The max() guarantees forward progress:
        # the original `start = end - overlap` could loop forever when
        # overlap >= chunk_size, or when a sentence break shortened `end`
        # to within `overlap` characters of `start`.
        start = max(end - overlap, start + 1)
    return chunks
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt file in a folder and split each one into chunks.

    Args:
        folder_path: Path to folder containing .txt files.
        chunk_size: Size of each chunk in characters (default: 400).
        overlap: Overlap between chunks in characters (default: 75).

    Returns:
        pandas.DataFrame with one row per chunk (path, chunk_id,
        total_chunks, content, content_length); empty if nothing loaded.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    loaded_files = 0
    for path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                text = clean_text(handle.read())
            if not text:
                logger.warning(f"Empty content in {path}")
                continue
            # One row per chunk, all sharing the file's path and chunk total.
            pieces = split_into_chunks(text, chunk_size, overlap)
            total = len(pieces)
            rows.extend(
                {
                    'path': path,
                    'chunk_id': index,
                    'total_chunks': total,
                    'content': piece,
                    'content_length': len(piece),
                }
                for index, piece in enumerate(pieces)
            )
            loaded_files += 1
            logger.info(f"Loaded {os.path.basename(path)}: {total} chunks")
        except Exception as e:
            logger.error(f"Error reading {path}: {e}")

    df = pd.DataFrame(rows)
    if not df.empty:
        logger.info(f"Total: {loaded_files} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
    return df
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load one .txt document and split it into chunks.

    Args:
        file_path: Path to the .txt file.
        chunk_size: Size of each chunk in characters.
        overlap: Overlap between chunks in characters.

    Returns:
        pandas.DataFrame with one row per chunk (path, chunk_id,
        total_chunks, content, content_length); empty on error or
        empty content.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = clean_text(handle.read())
        if not text:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()
        # Build one record per chunk; every record carries the chunk total.
        pieces = split_into_chunks(text, chunk_size, overlap)
        records = [
            {
                'path': file_path,
                'chunk_id': index,
                'total_chunks': len(pieces),
                'content': piece,
                'content_length': len(piece),
            }
            for index, piece in enumerate(pieces)
        ]
        logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")
        return pd.DataFrame(records)
    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()