| # document_processor.py | |
| import os | |
| import glob | |
| from tqdm import tqdm | |
| import pandas as pd | |
| from utils import clean_text, setup_logger | |
# Module-level logger obtained from the project's setup_logger helper under
# the name 'document_processor'; load_single_document uses it to report
# files that could not be read.
logger = setup_logger('document_processor')
def load_single_document(file_path) -> pd.DataFrame:
    """Read one UTF-8 text file and return it as a one-row DataFrame.

    The file's text is passed through ``clean_text`` before being stored.
    Loading is best-effort: any exception raised while opening, decoding,
    cleaning, or wrapping the file is logged through the module ``logger``
    and an empty frame is returned instead of the exception propagating.

    Args:
        file_path: Path of the file to read; passed directly to ``open``
            and stored unchanged in the ``path`` column.

    Returns:
        A DataFrame with the columns ``path`` and ``content``. It holds a
        single row on success and no rows on failure, so callers can test
        ``.empty`` and still rely on both columns being present.
    """
    columns = ['path', 'content']
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = clean_text(file.read())
        return pd.DataFrame([{'path': file_path, 'content': content}])
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a batch.
        logger.error(f"Error reading {file_path}: {e}")
        # Keep the schema identical to the success path so that indexing
        # result['content'] (or a concat of only-failed loads) does not
        # raise KeyError on a column-less frame.
        return pd.DataFrame(columns=columns)