GitHub Actions
Deploy from GitHub Actions
c9622da
"""Document loader for various file formats."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any
@dataclass
class Document:
    """A piece of loaded text together with free-form metadata about it."""

    # The document's text.
    content: str
    # Arbitrary key/value details; every instance gets its own fresh dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def source(self) -> str:
        """Return the ``"source"`` metadata entry, or ``"unknown"`` if unset."""
        origin = self.metadata.get("source", "unknown")
        return origin
class DocumentLoader:
    """Load documents from plain-text, Markdown, PDF and DOCX files.

    PDF and DOCX parsing rely on the ``pypdf`` and ``python-docx`` packages.
    Both are imported lazily, only when a file of that type is loaded, so
    text-only use needs neither.
    """

    # Lower-cased file suffixes (leading dot included) that can be loaded.
    SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}

    def __init__(self):
        """Initialize the document loader."""
        # Placeholders; none of the methods shown here reads them.
        self._pdf_loader = None
        self._docx_loader = None

    def load_file(self, file_path: str) -> Document:
        """Load a single file.

        Args:
            file_path: Path to the file.

        Returns:
            Loaded document. Its metadata holds ``source`` (absolute path),
            ``filename`` and ``extension`` (lower-cased suffix).

        Raises:
            FileNotFoundError: If file doesn't exist.
            ValueError: If file format is not supported.
            ImportError: If the parser package for a PDF/DOCX file is missing.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Compare case-insensitively so "REPORT.PDF" is accepted too.
        extension = path.suffix.lower()
        if extension not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file format: {extension}. "
                f"Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        content = self._load_by_extension(path, extension)
        return Document(
            content=content,
            metadata={
                "source": str(path.absolute()),
                "filename": path.name,
                "extension": extension,
            },
        )

    def load_directory(
        self,
        directory_path: str,
        recursive: bool = True
    ) -> List[Document]:
        """Load all supported files from a directory.

        Files that fail to load are reported with a printed warning and
        skipped, so one bad file does not abort the whole batch.

        Args:
            directory_path: Path to the directory.
            recursive: Whether to search recursively.

        Returns:
            List of loaded documents, ordered by path.

        Raises:
            FileNotFoundError: If the directory doesn't exist.
            ValueError: If the path is not a directory.
        """
        path = Path(directory_path)
        if not path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")
        if not path.is_dir():
            raise ValueError(f"Not a directory: {directory_path}")

        documents = []
        pattern = "**/*" if recursive else "*"
        # glob() yields entries in a filesystem-dependent order; sorting makes
        # the returned list reproducible across runs and platforms.
        for file_path in sorted(path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue
            try:
                doc = self.load_file(str(file_path))
            except Exception as e:
                # Deliberate best-effort: report the failure and keep going.
                print(f"Warning: Failed to load {file_path.name}: {e}")
            else:
                documents.append(doc)
                print(f"Loaded: {file_path.name}")
        return documents

    def _load_by_extension(self, path: Path, extension: str) -> str:
        """Load file content based on extension.

        Args:
            path: File path.
            extension: Lower-cased file extension, including the dot.

        Returns:
            File content as string.

        Raises:
            ValueError: If no loader is registered for the extension.
        """
        loaders = {
            ".txt": self._load_text,
            ".md": self._load_text,
            ".pdf": self._load_pdf,
            ".docx": self._load_docx,
        }
        loader = loaders.get(extension)
        if loader is None:
            raise ValueError(f"Unknown extension: {extension}")
        return loader(path)

    def _load_text(self, path: Path) -> str:
        """Load plain text file, decoded as UTF-8."""
        # "utf-8-sig" drops a leading byte-order mark when present and decodes
        # BOM-less files exactly like "utf-8", so no "\ufeff" leaks into content.
        return path.read_text(encoding="utf-8-sig")

    def _load_pdf(self, path: Path) -> str:
        """Load PDF file.

        Extracts text page by page and joins the pages with blank lines;
        pages that yield no text are skipped.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            # Chain explicitly so the traceback shows the original failure as
            # the cause rather than as an error during handling.
            raise ImportError("pypdf is required for PDF files: pip install pypdf") from exc

        reader = PdfReader(str(path))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n\n".join(text for text in page_texts if text)

    def _load_docx(self, path: Path) -> str:
        """Load DOCX file.

        Reads only ``doc.paragraphs`` and joins them with blank lines;
        paragraphs that are empty or whitespace-only are skipped.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        try:
            from docx import Document as DocxDocument
        except ImportError as exc:
            raise ImportError("python-docx is required for DOCX files: pip install python-docx") from exc

        doc = DocxDocument(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n\n".join(paragraphs)