File size: 4,679 Bytes
c9622da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""Document loader for various file formats."""

from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any


@dataclass
class Document:
    """A piece of loaded text together with free-form metadata.

    Attributes:
        content: The document text.
        metadata: Arbitrary key/value information about the document.
            Defaults to a fresh empty dict for every instance.
    """

    content: str
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def source(self) -> str:
        """Return the ``"source"`` metadata entry, or ``"unknown"`` if absent."""
        origin = self.metadata.get("source", "unknown")
        return origin


class DocumentLoader:
    """Load documents from various file formats."""
    
    SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}
    
    def __init__(self):
        """Initialize the document loader."""
        self._pdf_loader = None
        self._docx_loader = None
    
    def load_file(self, file_path: str) -> Document:
        """Load a single file.
        
        Args:
            file_path: Path to the file.
            
        Returns:
            Loaded document.
            
        Raises:
            ValueError: If file format is not supported.
            FileNotFoundError: If file doesn't exist.
        """
        path = Path(file_path)
        
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        
        extension = path.suffix.lower()
        
        if extension not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file format: {extension}. "
                f"Supported: {self.SUPPORTED_EXTENSIONS}"
            )
        
        content = self._load_by_extension(path, extension)
        
        return Document(
            content=content,
            metadata={
                "source": str(path.absolute()),
                "filename": path.name,
                "extension": extension
            }
        )
    
    def load_directory(
        self, 
        directory_path: str,
        recursive: bool = True
    ) -> List[Document]:
        """Load all supported files from a directory.
        
        Args:
            directory_path: Path to the directory.
            recursive: Whether to search recursively.
            
        Returns:
            List of loaded documents.
        """
        path = Path(directory_path)
        
        if not path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")
        
        if not path.is_dir():
            raise ValueError(f"Not a directory: {directory_path}")
        
        documents = []
        pattern = "**/*" if recursive else "*"
        
        for file_path in path.glob(pattern):
            if file_path.is_file() and file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS:
                try:
                    doc = self.load_file(str(file_path))
                    documents.append(doc)
                    print(f"Loaded: {file_path.name}")
                except Exception as e:
                    print(f"Warning: Failed to load {file_path.name}: {e}")
        
        return documents
    
    def _load_by_extension(self, path: Path, extension: str) -> str:
        """Load file content based on extension.
        
        Args:
            path: File path.
            extension: File extension.
            
        Returns:
            File content as string.
        """
        if extension in {".txt", ".md"}:
            return self._load_text(path)
        elif extension == ".pdf":
            return self._load_pdf(path)
        elif extension == ".docx":
            return self._load_docx(path)
        else:
            raise ValueError(f"Unknown extension: {extension}")
    
    def _load_text(self, path: Path) -> str:
        """Load plain text file."""
        return path.read_text(encoding="utf-8")
    
    def _load_pdf(self, path: Path) -> str:
        """Load PDF file."""
        try:
            from pypdf import PdfReader
        except ImportError:
            raise ImportError("pypdf is required for PDF files: pip install pypdf")
        
        reader = PdfReader(str(path))
        text_parts = []
        
        for page in reader.pages:
            text = page.extract_text()
            if text:
                text_parts.append(text)
        
        return "\n\n".join(text_parts)
    
    def _load_docx(self, path: Path) -> str:
        """Load DOCX file."""
        try:
            from docx import Document as DocxDocument
        except ImportError:
            raise ImportError("python-docx is required for DOCX files: pip install python-docx")
        
        doc = DocxDocument(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n\n".join(paragraphs)