Spaces:

Mobiworks
/

rag-chatbot

Running

App Files Files Community

rag-chatbot / components /document_loader.py

Mobiworks

Sync from GitHub via hub-sync

c237f60 verified 16 days ago

Raw

History Blame Contribute Delete

3.42 kB

	"""
	document_loader.py
	------------------
	Loads and parses documents from a file path or directory.
	Supports PDF, TXT, DOCX, HTML, and Markdown (.md).
	Returns a list of LangChain Document objects with metadata
	(source filename, page number where available).

	Bug Fix (Step 1):
	- Changed setdefault to direct assignment so source metadata is always
	normalized to just the filename, not the full absolute path.

	Enhancement (Step 2):
	- Added .md (Markdown) support via TextLoader.
	"""

	import logging
	from pathlib import Path
	from typing import List

	from langchain.schema import Document
	from langchain_community.document_loaders import (
	PyMuPDFLoader,
	TextLoader,
	Docx2txtLoader,
	UnstructuredHTMLLoader,
	)

	logger = logging.getLogger(__name__)

	# Registry of supported file extensions (lower-case, with leading dot) and the
	# LangChain loader class used for each. Extensions that share a loader are
	# grouped together; key order is preserved because it is shown to users in the
	# "Supported types" error message.
	LOADER_MAP = {
	    ".pdf": PyMuPDFLoader,
	    # Plain text and Markdown are both read as raw text.
	    **dict.fromkeys((".txt", ".md"), TextLoader),
	    ".docx": Docx2txtLoader,
	    **dict.fromkeys((".html", ".htm"), UnstructuredHTMLLoader),
	}


	def load_document(file_path: str | Path) -> List[Document]:
	    """
	    Load a single document and return a list of LangChain Document objects.

	    The loader class is looked up in ``LOADER_MAP`` by the file's lower-cased
	    extension, so ``report.PDF`` and ``report.pdf`` are handled the same way.
	    After loading, every document's ``metadata["source"]`` is overwritten with
	    the bare filename (no directory components).

	    Args:
	        file_path: Absolute or relative path to the document.

	    Returns:
	        List of Document objects with `.page_content` and `.metadata`.

	    Raises:
	        FileNotFoundError: If the file does not exist.
	        ValueError: If the file type is not supported.
	    """
	    file_path = Path(file_path)

	    if not file_path.exists():
	        raise FileNotFoundError(f"File not found: {file_path}")

	    ext = file_path.suffix.lower()
	    loader_cls = LOADER_MAP.get(ext)

	    if loader_cls is None:
	        # Iterating the dict yields its keys in insertion order.
	        supported = ", ".join(LOADER_MAP)
	        raise ValueError(
	            f"Unsupported file type '{ext}'. Supported types: {supported}"
	        )

	    logger.info("Loading %s with %s", file_path.name, loader_cls.__name__)
	    loader = loader_cls(str(file_path))
	    docs = loader.load()

	    # Always overwrite "source" with just the filename. The loaders populate
	    # this key themselves with the path they were given, so setdefault() would
	    # leave that path in place; direct assignment normalises it.
	    for doc in docs:
	        doc.metadata["source"] = file_path.name

	    logger.info("Loaded %d page(s) from %s", len(docs), file_path.name)
	    return docs


	def load_documents_from_directory(directory: str | Path) -> List[Document]:
	    """
	    Recursively load all supported documents from a directory.

	    Files are visited in sorted path order. Only regular files whose
	    lower-cased extension is a key of ``LOADER_MAP`` are loaded. A file that
	    fails to load is logged as a warning and skipped, so one bad file does
	    not abort the whole batch.

	    Args:
	        directory: Path to the folder containing source documents.

	    Returns:
	        Flat list of Document objects from all files found.

	    Raises:
	        NotADirectoryError: If ``directory`` is not an existing directory.
	    """
	    directory = Path(directory)

	    if not directory.is_dir():
	        raise NotADirectoryError(f"Not a directory: {directory}")

	    all_docs: List[Document] = []

	    for file_path in sorted(directory.rglob("*")):
	        # rglob("*") also yields sub-directories; one named like "notes.md"
	        # must not be handed to a loader.
	        if not file_path.is_file():
	            continue
	        if file_path.suffix.lower() not in LOADER_MAP:
	            continue

	        try:
	            docs = load_document(file_path)
	        except Exception as exc:
	            # Deliberate best-effort: any loader failure skips just this file.
	            logger.warning("Skipping %s — %s", file_path.name, exc)
	        else:
	            all_docs.extend(docs)

	    logger.info(
	        "Loaded %d document chunk(s) from directory '%s'",
	        len(all_docs),
	        directory,
	    )
	    return all_docs