# rag-chatbot/components/document_loader.py
# Mobiworks's picture
# Sync from GitHub via hub-sync
# c237f60 verified
# Raw
# History Blame Contribute Delete
# 3.42 kB
"""
document_loader.py
------------------
Loads and parses documents from a file path or directory.
Supports PDF, TXT, DOCX, HTML, and Markdown (.md).
Returns a list of LangChain Document objects with metadata
(source filename, page number where available).
Bug Fix (Step 1):
- Changed setdefault to direct assignment so source metadata is always
normalized to just the filename, not the full absolute path.
Enhancement (Step 2):
- Added .md (Markdown) support via TextLoader.
"""
import logging
from pathlib import Path
from typing import List
from langchain.schema import Document
from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
Docx2txtLoader,
UnstructuredHTMLLoader,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Map file extensions → loader class.
# Keys are lower-case suffixes including the leading dot; load_document()
# lower-cases the file's suffix before looking it up here. The insertion
# order of the keys is also the order in which supported types are listed
# in the "Unsupported file type" error message.
LOADER_MAP = {
    ".pdf": PyMuPDFLoader,
    ".txt": TextLoader,
    ".md": TextLoader,  # Markdown is loaded with the same loader as .txt
    ".docx": Docx2txtLoader,
    ".html": UnstructuredHTMLLoader,
    ".htm": UnstructuredHTMLLoader,
}
def load_document(file_path: str | Path) -> List[Document]:
    """
    Load a single document and return a list of LangChain Document objects.

    The loader class is looked up in ``LOADER_MAP`` by the file's lower-cased
    extension. After loading, the ``source`` metadata of every returned
    document is overwritten with the bare filename (no directory part).

    Args:
        file_path: Absolute or relative path to the document.

    Returns:
        List of Document objects with `.page_content` and `.metadata`.

    Raises:
        FileNotFoundError: If the path does not exist, or exists but is not
            a file (for example, a directory).
        ValueError: If the file type is not supported.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    # exists() is also true for directories; reject those here so they fail
    # with a clear, documented error instead of an opaque loader failure.
    if not file_path.is_file():
        raise FileNotFoundError(f"Not a file: {file_path}")

    ext = file_path.suffix.lower()
    loader_cls = LOADER_MAP.get(ext)
    if loader_cls is None:
        supported = ", ".join(LOADER_MAP)
        raise ValueError(
            f"Unsupported file type '{ext}'. Supported types: {supported}"
        )

    logger.info("Loading %s with %s", file_path.name, loader_cls.__name__)
    loader = loader_cls(str(file_path))
    docs = loader.load()

    # Always overwrite `source` with just the filename. The loaders already
    # populate this key with a path, so setdefault() would leave that path
    # in place; direct assignment normalises it.
    for doc in docs:
        doc.metadata["source"] = file_path.name

    logger.info("Loaded %d page(s) from %s", len(docs), file_path.name)
    return docs
def load_documents_from_directory(directory: str | Path) -> List[Document]:
    """
    Recursively load all supported documents from a directory.

    Entries are visited in sorted path order. Only regular files whose
    lower-cased extension is a key of ``LOADER_MAP`` are loaded. Loading is
    best-effort: a file that fails to load is logged as a warning and
    skipped, and the remaining files are still processed.

    Args:
        directory: Path to the folder containing source documents.

    Returns:
        Flat list of Document objects from all files found.

    Raises:
        NotADirectoryError: If `directory` is not an existing directory.
    """
    directory = Path(directory)
    if not directory.is_dir():
        raise NotADirectoryError(f"Not a directory: {directory}")

    all_docs: List[Document] = []
    supported_exts = set(LOADER_MAP)

    for file_path in sorted(directory.rglob("*")):
        if file_path.suffix.lower() not in supported_exts:
            continue
        # rglob("*") also yields directories; one whose name happens to end
        # in a supported suffix must not be handed to a loader.
        if not file_path.is_file():
            continue
        try:
            docs = load_document(file_path)
            all_docs.extend(docs)
        except Exception as exc:
            # Deliberately broad: one unreadable file must not abort the
            # whole directory scan.
            logger.warning("Skipping %s — %s", file_path.name, exc)

    logger.info(
        "Loaded %d document chunk(s) from directory '%s'",
        len(all_docs),
        directory,
    )
    return all_docs