# Commit 7e07738 — "Add Docling integration for multi-format document processing" (author: vn6295337)
# RAG-document-assistant/ingestion/load_docs.py
"""
Document loader for RAG ingestion.
Provides:
- load_markdown_docs(): Legacy markdown-only loader
- load_documents(): Unified loader (uses Docling if available, falls back to markdown)
CLI:
> python3 load_docs.py /full/path/to/your/docs/folder
prints a summary table for each file and exits with code 0.
"""
import os
import glob
import argparse
import re
import logging
from typing import List, Dict, Optional
logger = logging.getLogger(__name__)
def _clean_markdown(text: str) -> str:
"""
Clean markdown text by removing code blocks, HTML tags, and other non-content elements.
Args:
text: Raw markdown text to clean
Returns:
Cleaned text with markdown syntax removed
"""
# Remove code fences and their contents
text = re.sub(r"```.*?```", " ", text, flags=re.DOTALL)
# Remove HTML tags
text = re.sub(r"<[^>]+>", " ", text)
# Remove images/links syntax but keep alt/text
text = re.sub(r"!\[([^\]]*)\]\([^\)]*\)", r"\1", text)
text = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", text)
# Remove front-matter delimited by --- at top
text = re.sub(r"^---.*?---\s*", " ", text, flags=re.DOTALL)
# Collapse whitespace
text = re.sub(r"\s+", " ", text).strip()
return text
def load_markdown_docs(dir_path: str, ext: str = ".md", max_chars: int = 20000) -> List[Dict]:
    """
    Load markdown files from dir_path (non-recursive) as cleaned-text records.

    Files that cannot be read are reported with an ERROR status; files whose
    cleaned text exceeds max_chars are reported as SKIPPED_TOO_LARGE (useful
    to roughly enforce an 'under 5 pages' rule); files that clean to empty
    text are dropped entirely.

    Args:
        dir_path: Path to directory containing markdown files
        ext: File extension to look for (default: ".md")
        max_chars: Maximum number of characters to accept (default: 20000)

    Returns:
        List of document dictionaries with metadata and cleaned text

    Raises:
        FileNotFoundError: If directory does not exist
        ValueError: If max_chars is not positive
        OSError: If there are issues reading files
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    root = os.path.expanduser(dir_path)
    if not os.path.isdir(root):
        raise FileNotFoundError(f"Directory not found: {root}")

    results: List[Dict] = []
    for file_path in sorted(glob.glob(os.path.join(root, f"*{ext}"))):
        record = {"filename": os.path.basename(file_path), "path": file_path}
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                raw_text = handle.read()
        except Exception as exc:
            # Record unreadable files rather than aborting the whole load.
            record.update(
                text=None,
                chars=0,
                words=0,
                status=f"ERROR_READING_FILE: {str(exc)}",
            )
            results.append(record)
            continue

        cleaned_text = _clean_markdown(raw_text)
        char_count = len(cleaned_text)
        word_count = len(cleaned_text.split())

        if char_count == 0:
            # Files that clean down to nothing carry no content — drop them.
            continue

        if char_count > max_chars:
            # Oversized files are reported but their text is not kept.
            record.update(text=None, chars=char_count, words=word_count,
                          status="SKIPPED_TOO_LARGE")
        else:
            record.update(text=cleaned_text, chars=char_count, words=word_count,
                          status="OK")
        results.append(record)

    return results
def print_summary(docs: List[Dict]) -> None:
    """
    Print a fixed-width summary table of loaded documents to stdout.

    One row per document (filename truncated to 40 chars, status, char and
    word counts), followed by totals of OK vs. skipped entries. Prints a
    short notice and returns immediately when docs is empty.

    Args:
        docs: Document dictionaries as produced by load_markdown_docs().
    """
    if not docs:
        print("No markdown files found or all were skipped.")
        return

    divider = "-" * 80
    print(f"{'FILENAME':40} {'STATUS':15} {'CHARS':>8} {'WORDS':>8}")
    print(divider)

    ok_total = 0
    for record in docs:
        if record.get("status", "") == "OK":
            ok_total += 1
        row = (
            f"{record.get('filename', '')[:40]:40} "
            f"{record.get('status', ''):15} "
            f"{record.get('chars', 0):8d} "
            f"{record.get('words', 0):8d}"
        )
        print(row)

    print(divider)
    print(f"Total files: {len(docs)} OK: {ok_total} Skipped: {len(docs) - ok_total}")
# Optional Docling integration: probe for the multi-format loader at import
# time so load_documents() and the CLI can choose a code path without
# re-attempting the import on every call.
DOCLING_AVAILABLE = False
try:
    from src.ingestion.docling_loader import (
        load_documents_with_docling,
        convert_to_legacy_format,
        # Aliased to avoid shadowing this module's own print_summary().
        print_summary as docling_print_summary,
        SUPPORTED_EXTENSIONS
    )
    DOCLING_AVAILABLE = True
except ImportError:
    # Docling (or the loader module) is not installed: fall back to the
    # legacy markdown-only loader and restrict the supported extensions.
    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
def load_documents(
    dir_path: str,
    extensions: Optional[List[str]] = None,
    max_chars: int = 50000,
    use_docling: bool = True,
    recursive: bool = False
) -> List[Dict]:
    """
    Unified document loader - uses Docling if available, falls back to markdown.

    In the markdown fallback, every requested extension is loaded (previously
    only the first was honored and the rest were silently dropped), and a
    warning is logged if recursive search is requested, since the legacy
    loader only scans the top-level directory.

    Args:
        dir_path: Path to directory containing documents
        extensions: File extensions to process (None = all supported)
        max_chars: Maximum characters per document
        use_docling: Prefer Docling if available
        recursive: Search subdirectories (Docling path only)

    Returns:
        List of document dicts with text and metadata

    Raises:
        FileNotFoundError: If dir_path does not exist (fallback path)
        ValueError: If max_chars is not positive (fallback path)
    """
    if use_docling and DOCLING_AVAILABLE:
        logger.info("Using Docling multi-format loader")
        parsed = load_documents_with_docling(
            dir_path,
            extensions=extensions,
            max_chars=max_chars,
            recursive=recursive
        )
        return convert_to_legacy_format(parsed)

    logger.info("Using legacy markdown loader")
    if recursive:
        # The legacy loader globs only the top level; make the limitation visible.
        logger.warning(
            "recursive=True is not supported by the legacy markdown loader; "
            "searching top-level directory only"
        )
    # Normalize each extension to a leading dot and load all of them,
    # instead of silently using only the first requested extension.
    requested = extensions if extensions else [".md"]
    normalized = [e if e.startswith(".") else f".{e}" for e in requested]
    docs: List[Dict] = []
    for ext in normalized:
        docs.extend(load_markdown_docs(dir_path, ext=ext, max_chars=max_chars))
    return docs
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Load and summarize documents for RAG ingestion."
)
parser.add_argument("dir", help="Directory containing documents")
parser.add_argument(
"--ext", "-e",
nargs="+",
default=None,
help="File extensions to load (default: all supported)"
)
parser.add_argument(
"--max-chars",
type=int,
default=50000,
help="Max characters to accept (default 50k)"
)
parser.add_argument(
"--no-docling",
action="store_true",
help="Disable Docling, use markdown-only loader"
)
parser.add_argument(
"--recursive", "-r",
action="store_true",
help="Search subdirectories"
)
args = parser.parse_args()
if args.no_docling or not DOCLING_AVAILABLE:
# Legacy markdown mode
ext = args.ext[0] if args.ext else ".md"
docs = load_markdown_docs(args.dir, ext=ext, max_chars=args.max_chars)
print_summary(docs)
else:
# Docling multi-format mode
parsed = load_documents_with_docling(
args.dir,
extensions=args.ext,
max_chars=args.max_chars,
recursive=args.recursive
)
docling_print_summary(parsed)