import logging
import os
from typing import Any, Dict, List

import docx2txt
import markdown
import PyPDF2
from bs4 import BeautifulSoup

from preprocessor import TextPreprocessor

logger = logging.getLogger(__name__)


class DocumentLoader:
    """
    A utility class to load documents from various formats.
    Supports PDF, DOCX, TXT, HTML, and Markdown files.

    Each format-specific ``load_*`` method is best-effort: on any error it
    logs the failure and returns an empty string instead of raising.
    """

    # Single source of truth for the supported extensions. Maps a lower-cased
    # extension to the *name* of the loader method; the method is resolved
    # with getattr(cls, ...) at call time so subclass overrides are honoured.
    _LOADERS: Dict[str, str] = {
        '.pdf': 'load_pdf',
        '.docx': 'load_docx',
        '.txt': 'load_txt',
        '.html': 'load_html',
        '.htm': 'load_html',
        '.md': 'load_md',
    }

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        The text of every page is followed by a newline. Returns an empty
        string (after logging the error) if the file cannot be read.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # Defensive: treat a falsy extraction result (e.g. None) as
                # empty text so that one page without text does not raise
                # and cause the whole document to be discarded.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in reader.pages
                )
        except Exception as e:
            logger.error("Error loading PDF %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file.

        Returns an empty string (after logging the error) on failure.
        """
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            logger.error("Error loading DOCX %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load a plain-text file, decoded as UTF-8.

        Returns an empty string (after logging the error) on failure,
        including when the file is not valid UTF-8.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                return txt_file.read()
        except Exception as e:
            logger.error("Error loading TXT %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load an HTML file (read as UTF-8) and extract its visible text.

        ``<script>`` and ``<style>`` elements are removed before extraction;
        text nodes are joined with newlines. Returns an empty string (after
        logging the error) on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading HTML %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load a Markdown file (read as UTF-8) and extract its text.

        Returns an empty string (after logging the error) on failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then extract text
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading MD %s: %s", file_path, e)
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        The extension is matched case-insensitively. The raw text produced
        by the format-specific loader is passed through
        ``TextPreprocessor.clean_text`` and that result is returned.

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path.lower())
        loader_name = cls._LOADERS.get(ext)
        if loader_name is None:
            raise ValueError(f"Unsupported file format: {ext}")

        raw_text = getattr(cls, loader_name)(file_path)

        # Preprocess the text
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(
        cls,
        directory_path: str,
        chunk_size: int = 512,
        overlap: int = 50,
    ) -> List[Dict[str, Any]]:
        """Load all supported documents from a directory, with optional chunking.

        Walks ``directory_path`` recursively. Files with an unsupported
        extension and documents whose cleaned content is blank are skipped.

        Args:
            directory_path: Root directory to walk.
            chunk_size: Content longer than this many characters is split
                with ``TextPreprocessor.chunk_text``; the value is forwarded
                to that helper unchanged.
            overlap: Forwarded to ``TextPreprocessor.chunk_text``.

        Returns:
            A list of dicts with ``'content'``, ``'source'`` and
            ``'metadata'`` keys. Metadata always holds ``'file_name'`` and
            ``'file_path'``; entries produced by chunking additionally hold
            ``'chunk_id'`` and ``'total_chunks'``.
        """
        documents: List[Dict[str, Any]] = []

        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                _, ext = os.path.splitext(file_name.lower())
                if ext not in cls._LOADERS:
                    continue

                file_path = os.path.join(root, file_name)
                content = cls.load_document(file_path)
                if not content.strip():
                    # Only add non-empty documents
                    continue

                if len(content) <= chunk_size:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': {
                            'file_name': file_name,
                            'file_path': file_path,
                        },
                    })
                    continue

                # The content is too long for a single entry: chunk it
                chunks = TextPreprocessor.chunk_text(
                    content, chunk_size, overlap
                )
                total_chunks = len(chunks)
                documents.extend(
                    {
                        'content': chunk,
                        'source': file_path,
                        'metadata': {
                            'file_name': file_name,
                            'file_path': file_path,
                            'chunk_id': chunk_id,
                            'total_chunks': total_chunks,
                        },
                    }
                    for chunk_id, chunk in enumerate(chunks)
                )

        return documents