File size: 518 Bytes
e820a8a
 
 
 
 
 
 
 
 
d12e375
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# document_processor.py
import os
import glob
from tqdm import tqdm
import pandas as pd
from utils import clean_text, setup_logger

# Module-level logger created through the project's setup_logger helper
# (imported from utils); load_single_document reports read failures to it.
logger = setup_logger('document_processor')

def load_single_document(file_path):
    """Load one UTF-8 text file into a single-row DataFrame.

    The file's text is passed through ``clean_text`` (imported from
    ``utils``) and stored alongside the path it came from.

    Args:
        file_path: Path of the file to open; it is read in text mode with
            UTF-8 decoding.

    Returns:
        A ``pandas.DataFrame`` with one row and the columns ``path`` and
        ``content`` on success. If anything raises an ``Exception`` while
        opening, reading, cleaning, or building the frame, the error is
        logged and an empty, column-less ``DataFrame`` is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        # Everything stays inside the ``try`` so that failures in cleaning
        # or frame construction are handled the same way as read failures.
        record = {'path': file_path, 'content': clean_text(raw_text)}
        return pd.DataFrame([record])
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty
        # frame rather than propagating the exception to the caller.
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()