File size: 5,335 Bytes
bec06d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from typing import List, Dict, Any
import PyPDF2
import docx2txt
from bs4 import BeautifulSoup
import markdown
import logging
from preprocessor import TextPreprocessor

# Module-level logger named after this module; DocumentLoader reports
# document-loading failures through it.
logger = logging.getLogger(__name__)

class DocumentLoader:
    """
    A utility class to load documents from various formats.
    Supports PDF, DOCX, TXT, HTML, and Markdown files.

    Every ``load_<format>`` method is best-effort: on any failure it logs the
    error through the module-level ``logger`` and returns an empty string
    instead of raising.
    """

    # Single source of truth for format support: maps a lower-cased file
    # extension to the *name* of the loader method that handles it. Names
    # (not function objects) are stored so the lookup goes through ``cls``
    # and subclass overrides of a loader are honoured.
    _LOADERS = {
        '.pdf': 'load_pdf',
        '.docx': 'load_docx',
        '.txt': 'load_txt',
        '.html': 'load_html',
        '.htm': 'load_html',
        '.md': 'load_md',
    }

    # Extensions accepted by load_document / load_documents_from_directory.
    SUPPORTED_EXTENSIONS = frozenset(_LOADERS)

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Load and extract text from a PDF file.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The text of every page, each followed by a newline, or ``""`` if
            the file could not be read or parsed.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                # Join once instead of repeated ``+=``; ``or ""`` keeps a page
                # that yields no text from aborting the whole document.
                # The join must stay inside the ``with``: pages are read
                # from the open file handle.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in reader.pages
                )
        except Exception as exc:  # deliberate best-effort boundary
            logger.error("Error loading PDF %s: %s", file_path, exc)
            return ""

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Load and extract text from a DOCX file.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            Whatever ``docx2txt.process`` returns for the file, or ``""`` if
            it raised.
        """
        try:
            return docx2txt.process(file_path)
        except Exception as exc:  # deliberate best-effort boundary
            logger.error("Error loading DOCX %s: %s", file_path, exc)
            return ""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Load and extract text from a TXT file.

        Args:
            file_path: Path of the UTF-8 encoded text file to read.

        Returns:
            The full file content, or ``""`` if the file could not be read
            or decoded.
        """
        try:
            # 'utf-8-sig' decodes plain UTF-8 identically but also strips a
            # leading byte-order mark, which would otherwise leak into the
            # text as '\ufeff'.
            with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
                return txt_file.read()
        except Exception as exc:  # deliberate best-effort boundary
            logger.error("Error loading TXT %s: %s", file_path, exc)
            return ""

    @staticmethod
    def load_html(file_path: str) -> str:
        """Load and extract text from an HTML file.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted.

        Args:
            file_path: Path of the UTF-8 encoded HTML file to read.

        Returns:
            The document text with ``"\\n"`` as the separator between text
            nodes, or ``""`` if the file could not be read or parsed.
        """
        try:
            # 'utf-8-sig' drops a leading BOM so it does not show up as a
            # stray character in the extracted text.
            with open(file_path, 'r', encoding='utf-8-sig') as html_file:
                soup = BeautifulSoup(html_file, 'html.parser')
                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()
                return soup.get_text(separator="\n")
        except Exception as exc:  # deliberate best-effort boundary
            logger.error("Error loading HTML %s: %s", file_path, exc)
            return ""

    @staticmethod
    def load_md(file_path: str) -> str:
        """Load and extract text from a Markdown file.

        The Markdown is rendered to HTML first and the text is then extracted
        from that HTML.

        Args:
            file_path: Path of the UTF-8 encoded Markdown file to read.

        Returns:
            The rendered text with ``"\\n"`` as the separator between text
            nodes, or ``""`` if the file could not be read or converted.
        """
        try:
            # 'utf-8-sig' drops a leading BOM; left in place it would sit in
            # front of the first line and keep e.g. a '# Heading' there from
            # being recognised as Markdown syntax.
            with open(file_path, 'r', encoding='utf-8-sig') as md_file:
                md_content = md_file.read()
            # Convert Markdown to HTML first, then extract text
            html_content = markdown.markdown(md_content)
            soup = BeautifulSoup(html_content, 'html.parser')
            return soup.get_text(separator="\n")
        except Exception as exc:  # deliberate best-effort boundary
            logger.error("Error loading MD %s: %s", file_path, exc)
            return ""

    @classmethod
    def load_document(cls, file_path: str) -> str:
        """Load a document based on its extension and preprocess it.

        Args:
            file_path: Path of the document; its extension (compared
                case-insensitively) selects the loader.

        Returns:
            The loaded text after ``TextPreprocessor.clean_text``.

        Raises:
            ValueError: If the extension is not in ``SUPPORTED_EXTENSIONS``.
        """
        _, ext = os.path.splitext(file_path.lower())

        loader_name = cls._LOADERS.get(ext)
        if loader_name is None:
            raise ValueError(f"Unsupported file format: {ext}")

        raw_text = getattr(cls, loader_name)(file_path)

        # Preprocess the text
        return TextPreprocessor.clean_text(raw_text)

    @classmethod
    def load_documents_from_directory(cls, directory_path: str, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
        """Load all supported documents under a directory, chunking long ones.

        The directory is walked recursively with ``os.walk``. Files whose
        extension is not supported, and documents whose cleaned content is
        empty or whitespace-only, are skipped.

        Args:
            directory_path: Root directory to walk.
            chunk_size: Content longer than this many characters is split
                with ``TextPreprocessor.chunk_text``; also forwarded to it.
            overlap: Forwarded unchanged to ``TextPreprocessor.chunk_text``
                (presumably the overlap between consecutive chunks; see
                that helper for the exact meaning).

        Returns:
            A list of dicts with keys ``'content'``, ``'source'`` and
            ``'metadata'``. Metadata always carries ``'file_name'`` and
            ``'file_path'``; entries produced by chunking additionally carry
            ``'chunk_id'`` and ``'total_chunks'``.
        """
        documents: List[Dict[str, Any]] = []

        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                _, ext = os.path.splitext(file_name.lower())
                if ext not in cls.SUPPORTED_EXTENSIONS:
                    continue

                file_path = os.path.join(root, file_name)
                content = cls.load_document(file_path)

                # Only add non-empty documents
                if not content.strip():
                    continue

                # Short enough to keep as a single entry.
                if len(content) <= chunk_size:
                    documents.append({
                        'content': content,
                        'source': file_path,
                        'metadata': {'file_name': file_name, 'file_path': file_path}
                    })
                    continue

                # The content is too long: chunk it.
                chunks = TextPreprocessor.chunk_text(content, chunk_size, overlap)
                total_chunks = len(chunks)
                documents.extend(
                    {
                        'content': chunk,
                        'source': file_path,
                        'metadata': {
                            'file_name': file_name,
                            'file_path': file_path,
                            'chunk_id': chunk_id,
                            'total_chunks': total_chunks
                        }
                    }
                    for chunk_id, chunk in enumerate(chunks)
                )

        return documents