Spaces:

vn6295337
/

RAG-document-assistant

Sleeping

File size: 7,280 Bytes

# RAG-document-assistant/ingestion/load_docs.py
"""
Document loader for RAG ingestion.

Provides:
- load_markdown_docs(): Legacy markdown-only loader
- load_documents(): Unified loader (uses Docling if available, falls back to markdown)

CLI:
    python3 load_docs.py /full/path/to/your/docs/folder [--ext EXT ...] [--max-chars N] [--no-docling] [--recursive]

Prints a summary table of the files found and exits with code 0.
"""

import os
import glob
import argparse
import re
import logging
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)

def _clean_markdown(text: str) -> str:
    """
    Clean markdown text by removing code blocks, HTML tags, and other non-content elements.
    
    Args:
        text: Raw markdown text to clean
        
    Returns:
        Cleaned text with markdown syntax removed
    """
    # Remove code fences and their contents
    text = re.sub(r"```.*?```", " ", text, flags=re.DOTALL)
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Remove images/links syntax but keep alt/text
    text = re.sub(r"!\[([^\]]*)\]\([^\)]*\)", r"\1", text)
    text = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", text)
    # Remove front-matter delimited by --- at top
    text = re.sub(r"^---.*?---\s*", " ", text, flags=re.DOTALL)
    # Collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

def _doc_entry(fp: str, text: Optional[str], chars: int, words: int, status: str) -> Dict:
    """Build one result record for file ``fp`` in the shape load_markdown_docs returns."""
    return {
        "filename": os.path.basename(fp),
        "path": fp,
        "text": text,
        "chars": chars,
        "words": words,
        "status": status,
    }


def load_markdown_docs(dir_path: str, ext: str = ".md", max_chars: int = 20000) -> List[Dict]:
    """
    Load markdown files from dir_path (non-recursive). Returns list of metadata+clean text.

    Each file is read as UTF-8 and cleaned with _clean_markdown. One record is
    produced per file, except for files whose cleaned text is empty, which are
    omitted entirely. Records are ordered by sorted file path.

    Args:
        dir_path: Path to directory containing markdown files ("~" is expanded)
        ext: File extension to look for (default: ".md")
        max_chars: Maximum number of characters of *cleaned* text to accept
            (default: 20000)

    Returns:
        List of document dictionaries with keys "filename", "path", "text",
        "chars", "words" and "status". "status" is one of:
        - "OK": "text" holds the cleaned content
        - "SKIPPED_TOO_LARGE": cleaned text exceeds max_chars; "text" is None
        - "ERROR_READING_FILE: <reason>": the file could not be read or
          decoded; "text" is None and the counts are 0

    Raises:
        FileNotFoundError: If directory does not exist
        ValueError: If max_chars is not positive
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    path = os.path.expanduser(dir_path)
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Directory not found: {path}")

    # Escape glob metacharacters in the directory part; otherwise a directory
    # such as "docs[1]" is interpreted as a pattern and matches nothing.
    pattern = os.path.join(glob.escape(path), f"*{ext}")

    docs = []
    for fp in sorted(glob.glob(pattern)):
        try:
            with open(fp, "r", encoding="utf-8") as f:
                raw = f.read()
        except (OSError, UnicodeError) as e:
            # Unreadable or non-UTF-8 file: report it instead of aborting the run
            docs.append(_doc_entry(fp, None, 0, 0, f"ERROR_READING_FILE: {str(e)}"))
            continue

        cleaned = _clean_markdown(raw)
        chars = len(cleaned)
        if chars == 0:
            # Nothing left after cleaning: leave the file out of the result
            continue
        words = len(cleaned.split())
        if chars > max_chars:
            # Too large: keep the counts for the report but drop the text
            docs.append(_doc_entry(fp, None, chars, words, "SKIPPED_TOO_LARGE"))
            continue
        docs.append(_doc_entry(fp, cleaned, chars, words, "OK"))
    return docs

def print_summary(docs: List[Dict]):
    """
    Print a fixed-width summary table of loaded documents to stdout.

    Args:
        docs: Document records; the "filename", "status", "chars" and "words"
            keys are read, with missing keys shown as empty text or 0.

    The table has one row per record (filenames truncated to 40 characters),
    followed by a totals line in which every record whose status is not "OK"
    counts as skipped. An empty list prints a single notice instead.
    """
    if not docs:
        print("No markdown files found or all were skipped.")
        return

    divider = "-" * 80
    print(f"{'FILENAME':40} {'STATUS':15} {'CHARS':>8} {'WORDS':>8}")
    print(divider)

    n_ok = 0
    for entry in docs:
        label = entry.get("filename", "")[:40]
        state = entry.get("status", "")
        n_chars = entry.get("chars", 0)
        n_words = entry.get("words", 0)
        print(f"{label:40} {state:15} {n_chars:8d} {n_words:8d}")
        if state == "OK":
            n_ok += 1

    n_skipped = len(docs) - n_ok
    print(divider)
    print(f"Total files: {len(docs)}  OK: {n_ok}  Skipped: {n_skipped}")

# Optional dependency: try to import the Docling-based multi-format loader.
# DOCLING_AVAILABLE records whether the import succeeded. When it fails, the
# names imported below (load_documents_with_docling, convert_to_legacy_format,
# docling_print_summary) are never bound, so callers in this module must check
# DOCLING_AVAILABLE before using them.
DOCLING_AVAILABLE = False
try:
    from src.ingestion.docling_loader import (
        load_documents_with_docling,
        convert_to_legacy_format,
        print_summary as docling_print_summary,
        SUPPORTED_EXTENSIONS
    )
    DOCLING_AVAILABLE = True
except ImportError:
    # Docling loader not importable: fall back to a markdown-only extension set
    # so SUPPORTED_EXTENSIONS is defined either way.
    SUPPORTED_EXTENSIONS = {".md", ".markdown"}


def load_documents(
    dir_path: str,
    extensions: Optional[List[str]] = None,
    max_chars: int = 50000,
    use_docling: bool = True,
    recursive: bool = False
) -> List[Dict]:
    """
    Unified document loader - uses Docling if available, falls back to markdown.

    When Docling is used, all arguments are forwarded to
    load_documents_with_docling and the result is passed through
    convert_to_legacy_format. In the fallback path the legacy markdown loader
    scans a single extension in the top-level directory only; options it
    cannot honor are reported with a warning instead of being dropped silently.

    Args:
        dir_path: Path to directory containing documents
        extensions: File extensions to process (None = all supported by
            Docling; the fallback loader uses only the first entry, adding a
            leading "." if missing, and defaults to ".md")
        max_chars: Maximum characters per document
        use_docling: Prefer Docling if available
        recursive: Search subdirectories (Docling path only)

    Returns:
        List of document dicts with text and metadata
    """
    if use_docling and DOCLING_AVAILABLE:
        logger.info("Using Docling multi-format loader")
        parsed = load_documents_with_docling(
            dir_path,
            extensions=extensions,
            max_chars=max_chars,
            recursive=recursive
        )
        return convert_to_legacy_format(parsed)

    logger.info("Using legacy markdown loader")
    if recursive:
        # load_markdown_docs has no recursive mode; say so rather than
        # returning a silently incomplete result.
        logger.warning(
            "recursive=True is ignored by the legacy markdown loader; "
            "only the top-level directory is scanned"
        )

    ext = ".md"
    if extensions:
        if len(extensions) > 1:
            logger.warning(
                "Legacy markdown loader handles a single extension; "
                "using %r and ignoring %r",
                extensions[0],
                extensions[1:],
            )
        first = extensions[0]
        # Accept both "md" and ".md"
        ext = first if first.startswith(".") else f".{first}"
    return load_markdown_docs(dir_path, ext=ext, max_chars=max_chars)


if __name__ == "__main__":
    # CLI entry point: load the documents in a directory and print a summary
    # table. Uses the Docling loader when it is importable and not disabled
    # with --no-docling; otherwise falls back to the markdown-only loader.
    parser = argparse.ArgumentParser(
        description="Load and summarize documents for RAG ingestion."
    )
    parser.add_argument("dir", help="Directory containing documents")
    parser.add_argument(
        "--ext", "-e",
        nargs="+",
        default=None,
        help="File extensions to load (default: all supported)"
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=50000,
        help="Max characters to accept (default 50k)"
    )
    parser.add_argument(
        "--no-docling",
        action="store_true",
        help="Disable Docling, use markdown-only loader"
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Search subdirectories"
    )
    args = parser.parse_args()

    if args.no_docling or not DOCLING_AVAILABLE:
        # Legacy markdown mode: one extension, top-level directory only.
        ext = ".md"
        if args.ext:
            # Accept both "md" and ".md", matching load_documents()
            ext = args.ext[0] if args.ext[0].startswith(".") else f".{args.ext[0]}"
            if len(args.ext) > 1:
                logger.warning(
                    "Markdown-only mode handles a single extension; "
                    "using %r and ignoring %r",
                    args.ext[0],
                    args.ext[1:],
                )
        if args.recursive:
            logger.warning(
                "--recursive is ignored in markdown-only mode; "
                "only the top-level directory is scanned"
            )
        docs = load_markdown_docs(args.dir, ext=ext, max_chars=args.max_chars)
        print_summary(docs)
    else:
        # Docling multi-format mode
        parsed = load_documents_with_docling(
            args.dir,
            extensions=args.ext,
            max_chars=args.max_chars,
            recursive=args.recursive
        )
        docling_print_summary(parsed)