| |
| import os |
| import glob |
| from tqdm import tqdm |
| import pandas as pd |
| from utils import clean_text, setup_logger |
|
|
# Module-level logger for this file; configured by the project's setup_logger helper.
logger = setup_logger('document_processor')
|
|
| def split_into_chunks(text, chunk_size=400, overlap=75): |
| """ |
| Split text into overlapping chunks |
| |
| Args: |
| text: The text to split |
| chunk_size: Number of characters per chunk |
| overlap: Number of characters to overlap between chunks |
| """ |
| chunks = [] |
| start = 0 |
| text_length = len(text) |
| |
| while start < text_length: |
| end = start + chunk_size |
| chunk = text[start:end] |
| |
| |
| if end < text_length: |
| |
| last_period = chunk.rfind('.') |
| last_question = chunk.rfind('؟') |
| last_exclamation = chunk.rfind('!') |
| last_newline = chunk.rfind('\n') |
| |
| |
| break_point = max(last_period, last_question, last_exclamation, last_newline) |
| |
| |
| if break_point > chunk_size * 0.5: |
| chunk = chunk[:break_point + 1] |
| end = start + break_point + 1 |
| |
| chunk = chunk.strip() |
| if chunk: |
| chunks.append(chunk) |
| |
| start = end - overlap |
| |
| return chunks |
|
|
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt document in a folder and split each one into chunks.

    Args:
        folder_path: Path to folder containing .txt files
        chunk_size: Size of each chunk in characters (default: 400)
        overlap: Overlap between chunks in characters (default: 75)

    Returns:
        pandas.DataFrame with one row per chunk (columns: path, chunk_id,
        total_chunks, content, content_length); empty if nothing was loaded.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    loaded_files = 0

    for path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                text = clean_text(fh.read())

            if not text:
                logger.warning(f"Empty content in {path}")
                continue

            pieces = split_into_chunks(text, chunk_size, overlap)

            # One row per chunk, tagged with its position within the file.
            rows.extend(
                {
                    'path': path,
                    'chunk_id': idx,
                    'total_chunks': len(pieces),
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(pieces)
            )

            loaded_files += 1
            logger.info(f"Loaded {os.path.basename(path)}: {len(pieces)} chunks")

        except Exception as exc:
            logger.error(f"Error reading {path}: {exc}")

    df = pd.DataFrame(rows)

    if not df.empty:
        logger.info(f"Total: {loaded_files} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")

    return df
|
|
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single .txt document and split it into chunks.

    Args:
        file_path: Path to the .txt file
        chunk_size: Size of each chunk in characters
        overlap: Overlap between chunks in characters

    Returns:
        pandas.DataFrame with one row per chunk (columns: path, chunk_id,
        total_chunks, content, content_length); empty on error or empty file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = clean_text(fh.read())

        if not text:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()

        pieces = split_into_chunks(text, chunk_size, overlap)
        total = len(pieces)

        # Build one record per chunk, keeping its position within the file.
        rows = [
            {
                'path': file_path,
                'chunk_id': idx,
                'total_chunks': total,
                'content': piece,
                'content_length': len(piece),
            }
            for idx, piece in enumerate(pieces)
        ]

        logger.info(f"Loaded {os.path.basename(file_path)}: {total} chunks")
        return pd.DataFrame(rows)

    except Exception as exc:
        logger.error(f"Error reading {file_path}: {exc}")
        return pd.DataFrame()