Spaces:

muhammadshaheryar
/

Docker_Deploy

Configuration error

File size: 5,335 Bytes

bec06d9

import os
from typing import List, Dict, Any
import PyPDF2
import docx2txt
from bs4 import BeautifulSoup
import markdown
import logging
from preprocessor import TextPreprocessor

# Module-level logger named after this module; the loader methods below
# report read/parse failures through it instead of raising.
logger = logging.getLogger(__name__)

class DocumentLoader:
    """
    A utility class to load documents from various formats.
    Supports PDF, DOCX, TXT, HTML, and Markdown files.

    Each ``load_<format>`` method is best-effort: any exception raised while
    reading or parsing the file is logged and an empty string is returned,
    so one unreadable file does not abort a directory scan.
    """

    # Single source of truth for supported formats: lower-cased extension ->
    # name of the loader method. Names (not function objects) are stored so
    # that subclasses overriding a loader are honoured via ``getattr(cls, ...)``.
    _LOADERS: Dict[str, str] = {
        '.pdf': 'load_pdf',
        '.docx': 'load_docx',
        '.txt': 'load_txt',
        '.html': 'load_html',
        '.htm': 'load_html',
        '.md': 'load_md',
    }

    # Public, read-only view of the extensions this class can load.
    SUPPORTED_EXTENSIONS = frozenset(_LOADERS)

    @staticmethod
    def _join_page_text(pages) -> str:
        """Concatenate the text of each page, terminating every page with a newline.

        ``pages`` is any iterable of objects exposing ``extract_text()``.
        A page whose ``extract_text()`` returns ``None`` (or an empty string)
        contributes only its newline instead of raising ``TypeError``.
        """
        return "".join((page.extract_text() or "") + "\n" for page in pages)

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        Returns the text of all pages, each followed by a newline, or ``""``
        if the file cannot be opened or parsed (the error is logged).
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # Extraction must happen while the file is still open.
                return DocumentLoader._join_page_text(reader.pages)
        except Exception as e:
            logger.error("Error loading PDF %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file.

        Returns ``""`` if extraction fails (the error is logged).
        """
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            logger.error("Error loading DOCX %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load a UTF-8 encoded TXT file and return its full contents.

        Returns ``""`` if the file cannot be read or decoded (the error is
        logged).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                return txt_file.read()
        except Exception as e:
            logger.error("Error loading TXT %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load a UTF-8 encoded HTML file and return its visible text.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted; text nodes are separated by newlines. Returns ``""`` on
        failure (the error is logged).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading HTML %s: %s", file_path, e)
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load a UTF-8 encoded Markdown file and return its text content.

        The Markdown is rendered to HTML and the text is then extracted from
        that HTML, with text nodes separated by newlines. Returns ``""`` on
        failure (the error is logged).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then extract text
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as e:
            logger.error("Error loading MD %s: %s", file_path, e)
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        The extension is matched case-insensitively against
        ``SUPPORTED_EXTENSIONS``. The raw text returned by the matching
        loader is passed through ``TextPreprocessor.clean_text``.

        Raises:
            ValueError: If the file extension is not supported.
        """
        _, ext = os.path.splitext(file_path.lower())

        loader_name = cls._LOADERS.get(ext)
        if loader_name is None:
            raise ValueError(f"Unsupported file format: {ext}")

        raw_text = getattr(cls, loader_name)(file_path)

        # Preprocess the text
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(cls, directory_path: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
        """Load all supported documents from a directory, with optional chunking.

        Walks ``directory_path`` recursively. Files with an unsupported
        extension and files whose cleaned content is empty or whitespace-only
        are skipped.

        Args:
            directory_path: Root directory to scan.
            chunk_size: Content longer than this many characters is split
                with ``TextPreprocessor.chunk_text(content, chunk_size, overlap)``.
            overlap: Forwarded unchanged to ``TextPreprocessor.chunk_text``.

        Returns:
            A list of dicts with keys ``'content'``, ``'source'`` and
            ``'metadata'``. Metadata always holds ``'file_name'`` and
            ``'file_path'``; entries produced by chunking additionally hold
            ``'chunk_id'`` and ``'total_chunks'``.
        """
        documents: List[Dict[str, Any]] = []

        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                _, ext = os.path.splitext(file.lower())
                if ext not in cls.SUPPORTED_EXTENSIONS:
                    continue

                file_path = os.path.join(root, file)
                content = cls.load_document(file_path)
                if not content.strip():  # Only add non-empty documents
                    continue

                base_metadata = {'file_name': file, 'file_path': file_path}

                # Short content is stored as a single, un-chunked entry.
                if len(content) <= chunk_size:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': base_metadata,
                    })
                    continue

                chunks = TextPreprocessor.chunk_text(content, chunk_size, overlap)
                total_chunks = len(chunks)
                for i, chunk in enumerate(chunks):
                    documents.append({
                        'content': chunk,
                        'source': file_path,
                        # Fresh dict per chunk so entries never share metadata.
                        'metadata': {
                            **base_metadata,
                            'chunk_id': i,
                            'total_chunks': total_chunks,
                        },
                    })

        return documents