File size: 4,679 Bytes
c9622da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
"""Document loader for various file formats."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any
@dataclass
class Document:
    """A loaded document: its text content plus a free-form metadata dict."""

    # Full text of the document.
    content: str
    # Arbitrary key/value details about the document; a fresh dict is created
    # for every instance that does not supply one.
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def source(self) -> str:
        """Return the ``"source"`` metadata entry, or ``"unknown"`` if absent."""
        if "source" in self.metadata:
            return self.metadata["source"]
        return "unknown"
class DocumentLoader:
    """Load documents from plain-text, Markdown, PDF and DOCX files.

    PDF and DOCX support relies on the ``pypdf`` and ``python-docx``
    packages, which are imported only when a file of that type is loaded.
    """

    SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}

    def __init__(self):
        """Initialize the document loader."""
        # Not read by any method of this class as written; kept so existing
        # instance attributes stay available.
        self._pdf_loader = None
        self._docx_loader = None

    def load_file(self, file_path: str) -> "Document":
        """Load a single file.

        Args:
            file_path: Path to the file.

        Returns:
            Loaded document. Its metadata holds the absolute ``source`` path,
            the ``filename`` and the lower-cased ``extension``.

        Raises:
            FileNotFoundError: If file doesn't exist.
            ValueError: If file format is not supported.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Lower-case the suffix so e.g. "REPORT.PDF" matches ".pdf".
        extension = path.suffix.lower()
        if extension not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file format: {extension}. "
                f"Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        content = self._load_by_extension(path, extension)
        return Document(
            content=content,
            metadata={
                "source": str(path.absolute()),
                "filename": path.name,
                "extension": extension,
            },
        )

    def load_directory(
        self,
        directory_path: str,
        recursive: bool = True
    ) -> List["Document"]:
        """Load all supported files from a directory.

        Files that fail to load are reported with a printed warning and
        skipped; they do not abort the whole run.

        Args:
            directory_path: Path to the directory.
            recursive: Whether to search recursively.

        Returns:
            List of loaded documents, ordered by sorted file path.

        Raises:
            FileNotFoundError: If the directory doesn't exist.
            ValueError: If the path is not a directory.
        """
        path = Path(directory_path)
        if not path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")
        if not path.is_dir():
            raise ValueError(f"Not a directory: {directory_path}")

        documents = []
        pattern = "**/*" if recursive else "*"
        # glob() yields entries in arbitrary filesystem order; sorting makes
        # the result reproducible across runs and platforms.
        for file_path in sorted(path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            try:
                doc = self.load_file(str(file_path))
            except Exception as e:
                # Deliberate best effort: one unreadable file must not stop
                # the remaining files from loading.
                print(f"Warning: Failed to load {file_path.name}: {e}")
            else:
                documents.append(doc)
                print(f"Loaded: {file_path.name}")
        return documents

    def _load_by_extension(self, path: Path, extension: str) -> str:
        """Load file content based on extension.

        Args:
            path: File path.
            extension: Lower-cased file extension, including the leading dot.

        Returns:
            File content as string.

        Raises:
            ValueError: If no loader handles the extension.
        """
        if extension in {".txt", ".md"}:
            return self._load_text(path)
        if extension == ".pdf":
            return self._load_pdf(path)
        if extension == ".docx":
            return self._load_docx(path)
        raise ValueError(f"Unknown extension: {extension}")

    def _load_text(self, path: Path) -> str:
        """Load a plain text file as UTF-8, dropping a leading BOM if present."""
        # "utf-8-sig" decodes BOM-less files exactly like "utf-8" but strips
        # a leading byte-order mark instead of leaving "\ufeff" in the text.
        return path.read_text(encoding="utf-8-sig")

    def _load_pdf(self, path: Path) -> str:
        """Load a PDF file and return its page texts joined by blank lines.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "pypdf is required for PDF files: pip install pypdf"
            ) from err

        reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages for which extraction yields nothing are left out.
        return "\n\n".join(text for text in page_texts if text)

    def _load_docx(self, path: Path) -> str:
        """Load a DOCX file and return its non-blank paragraphs joined by blank lines.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        try:
            from docx import Document as DocxDocument
        except ImportError as err:
            raise ImportError(
                "python-docx is required for DOCX files: pip install python-docx"
            ) from err

        doc = DocxDocument(str(path))
        # Whitespace-only paragraphs are dropped; kept ones are not stripped.
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n\n".join(paragraphs)
|