# Document loading utilities: extract plain text from PDF, DOCX, TXT, HTML, and Markdown files.
| import os | |
| from typing import List, Dict, Any | |
| import PyPDF2 | |
| import docx2txt | |
| from bs4 import BeautifulSoup | |
| import markdown | |
| import logging | |
| from preprocessor import TextPreprocessor | |
| logger = logging.getLogger(__name__) | |
class DocumentLoader:
    """
    A utility class to load documents from various formats.

    Supports PDF, DOCX, TXT, HTML/HTM, and Markdown files. Each loader
    returns the extracted plain text, or an empty string on failure —
    errors are logged rather than raised, so one unreadable file does
    not abort a batch load.
    """

    # Extensions accepted by load_document / load_documents_from_directory.
    SUPPORTED_EXTENSIONS = ('.pdf', '.docx', '.txt', '.html', '.htm', '.md')

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        Returns the concatenated text of all pages (newline-separated),
        or "" on error.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # extract_text() may return None for pages with no
                # extractable text; coerce to "" so concatenation never
                # raises and one blank page doesn't discard the document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            logger.error(f"Error loading PDF {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file, or "" on error."""
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            logger.error(f"Error loading DOCX {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load and return the contents of a UTF-8 text file, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                return txt_file.read()
        except Exception as e:
            logger.error(f"Error loading TXT {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load an HTML file and return its visible text, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Script/style bodies are code, not content — drop them
                # before extracting text.
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as e:
            logger.error(f"Error loading HTML {file_path}: {str(e)}")
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load a Markdown file and return its plain text, or "" on error."""
        try:
            with open(file_path, 'r', encoding='utf-8') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then strip the markup so
            # formatting characters (#, *, etc.) do not leak into the text.
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as e:
            logger.error(f"Error loading MD {file_path}: {str(e)}")
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        Args:
            file_path: Path to the document; the loader is chosen from
                its (case-insensitive) extension.

        Returns:
            The extracted text after TextPreprocessor.clean_text().

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path.lower())
        # Dispatch table instead of an if/elif chain; .html and .htm
        # share a loader.
        loaders = {
            '.pdf': cls.load_pdf,
            '.docx': cls.load_docx,
            '.txt': cls.load_txt,
            '.html': cls.load_html,
            '.htm': cls.load_html,
            '.md': cls.load_md,
        }
        if ext not in loaders:
            raise ValueError(f"Unsupported file format: {ext}")
        raw_text = loaders[ext](file_path)
        # Normalize the raw extraction output before returning it.
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(cls, directory_path: str,
                                      chunk_size: int = 512,
                                      overlap: int = 50) -> List[Dict[str, Any]]:
        """Recursively load all supported documents under a directory.

        Documents longer than ``chunk_size`` characters are split via
        TextPreprocessor.chunk_text() into overlapping chunks, each
        emitted as its own entry with chunk_id / total_chunks metadata.

        Args:
            directory_path: Root directory to walk.
            chunk_size: Character threshold above which a document is chunked.
            overlap: Character overlap between consecutive chunks.

        Returns:
            A list of dicts with 'content', 'source', and 'metadata' keys;
            empty/unreadable documents are skipped.
        """
        documents: List[Dict[str, Any]] = []
        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                _, ext = os.path.splitext(file_name.lower())
                if ext not in cls.SUPPORTED_EXTENSIONS:
                    continue
                file_path = os.path.join(root, file_name)
                content = cls.load_document(file_path)
                if not content.strip():
                    # Loader failed or the file had no text — skip it.
                    continue
                if len(content) > chunk_size:
                    chunks = TextPreprocessor.chunk_text(content, chunk_size, overlap)
                    for i, chunk in enumerate(chunks):
                        documents.append({
                            'content': chunk,
                            'source': file_path,
                            'metadata': {
                                'file_name': file_name,
                                'file_path': file_path,
                                'chunk_id': i,
                                'total_chunks': len(chunks),
                            },
                        })
                else:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': {'file_name': file_name, 'file_path': file_path},
                    })
        return documents