|
|
"""Document loader for various file formats.""" |
|
|
|
|
|
from dataclasses import dataclass, field |
|
|
from pathlib import Path |
|
|
from typing import List, Optional, Dict, Any |
|
|
|
|
|
|
|
|
@dataclass
class Document:
    """Represents a loaded document.

    Attributes:
        content: The document's text.
        metadata: Free-form key/value information about the document.
            Defaults to a fresh empty dict for every instance.
    """

    content: str
    # default_factory gives each instance its own dict instead of sharing one.
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def source(self) -> str:
        """Get document source path.

        Returns:
            The value stored under the ``"source"`` metadata key, or
            ``"unknown"`` when that key is absent.
        """
        return self.metadata.get("source", "unknown")
|
|
|
|
|
|
|
|
class DocumentLoader:
    """Load documents from various file formats.

    ``.txt`` and ``.md`` files are read as text; ``.pdf`` and ``.docx`` files
    are parsed with ``pypdf`` and ``python-docx`` respectively. Those two
    packages are imported lazily, so they are only needed when a file of the
    corresponding type is actually loaded.
    """

    SUPPORTED_EXTENSIONS = {".txt", ".md", ".pdf", ".docx"}

    def __init__(self):
        """Initialize the document loader."""
        # Not read by any method visible in this class; kept so that code
        # relying on these attributes keeps working.
        self._pdf_loader = None
        self._docx_loader = None

    def load_file(self, file_path: str) -> "Document":
        """Load a single file.

        Args:
            file_path: Path to the file.

        Returns:
            Loaded document whose metadata holds the absolute ``source``
            path, the ``filename`` and the lower-cased ``extension``.

        Raises:
            ValueError: If file format is not supported.
            FileNotFoundError: If file doesn't exist.
            ImportError: If the file is a PDF/DOCX and the required parser
                package is not installed.
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Extensions are compared case-insensitively (".TXT" == ".txt").
        extension = path.suffix.lower()

        if extension not in self.SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file format: {extension}. "
                f"Supported: {self.SUPPORTED_EXTENSIONS}"
            )

        content = self._load_by_extension(path, extension)

        return Document(
            content=content,
            metadata={
                "source": str(path.absolute()),
                "filename": path.name,
                "extension": extension
            }
        )

    def load_directory(
        self,
        directory_path: str,
        recursive: bool = True
    ) -> List["Document"]:
        """Load all supported files from a directory.

        Files that fail to load are reported with a printed warning and
        skipped, so one bad file does not abort the whole directory.

        Args:
            directory_path: Path to the directory.
            recursive: Whether to search recursively.

        Returns:
            List of loaded documents, ordered by file path.

        Raises:
            FileNotFoundError: If the directory doesn't exist.
            ValueError: If the path exists but is not a directory.
        """
        path = Path(directory_path)

        if not path.exists():
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        if not path.is_dir():
            raise ValueError(f"Not a directory: {directory_path}")

        documents = []
        pattern = "**/*" if recursive else "*"

        # glob() yields entries in filesystem-dependent order; sorting makes
        # the resulting document order reproducible across runs and machines.
        for file_path in sorted(path.glob(pattern)):
            if not file_path.is_file():
                continue
            if file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS:
                continue

            try:
                doc = self.load_file(str(file_path))
                documents.append(doc)
                print(f"Loaded: {file_path.name}")
            except Exception as e:
                # Deliberate best-effort: report the failure and move on.
                print(f"Warning: Failed to load {file_path.name}: {e}")

        return documents

    def _load_by_extension(self, path: Path, extension: str) -> str:
        """Load file content based on extension.

        Args:
            path: File path.
            extension: File extension (lower-case, including the dot).

        Returns:
            File content as string.

        Raises:
            ValueError: If no loader handles ``extension``.
        """
        if extension in {".txt", ".md"}:
            return self._load_text(path)
        if extension == ".pdf":
            return self._load_pdf(path)
        if extension == ".docx":
            return self._load_docx(path)
        raise ValueError(f"Unknown extension: {extension}")

    def _load_text(self, path: Path) -> str:
        """Load plain text file.

        The file is decoded as UTF-8. A leading byte-order mark, if present,
        is dropped so it does not end up as ``\\ufeff`` in the content.
        """
        # "utf-8-sig" decodes exactly like "utf-8" but skips an optional BOM.
        return path.read_text(encoding="utf-8-sig")

    def _load_pdf(self, path: Path) -> str:
        """Load PDF file.

        Returns:
            The extracted text of every page that yields any text, joined
            with blank lines.

        Raises:
            ImportError: If ``pypdf`` is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "pypdf is required for PDF files: pip install pypdf"
            ) from err

        reader = PdfReader(str(path))
        text_parts = []

        for page in reader.pages:
            text = page.extract_text()
            # Skip pages for which extraction produced nothing.
            if text:
                text_parts.append(text)

        return "\n\n".join(text_parts)

    def _load_docx(self, path: Path) -> str:
        """Load DOCX file.

        Returns:
            The text of every non-blank paragraph, joined with blank lines.

        Raises:
            ImportError: If ``python-docx`` is not installed.
        """
        try:
            from docx import Document as DocxDocument
        except ImportError as err:
            raise ImportError(
                "python-docx is required for DOCX files: pip install python-docx"
            ) from err

        doc = DocxDocument(str(path))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n\n".join(paragraphs)
|
|
|