Add Docling integration for multi-format document processing
- Add docling_loader.py for PDF, DOCX, PPTX, HTML, image support
- Add structure-aware chunking (preserves tables, sections)
- Update ingestion API with use_docling and use_structure options
- Update Dockerfile with system deps for Docling
- Remove render.yaml (using HuggingFace Spaces only)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- Dockerfile +8 -2
- render.yaml +0 -17
- requirements.txt +3 -0
- src/ingestion/api.py +78 -10
- src/ingestion/chunker.py +199 -2
- src/ingestion/docling_loader.py +364 -0
- src/ingestion/load_docs.py +100 -12
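
For orientation before the per-file diffs, here is a minimal usage sketch of the new ingestion entry point, assuming the package imports as `src.ingestion` and a hypothetical `data/docs` folder; the signature matches the one added in `src/ingestion/api.py` below:

```python
# Minimal sketch (hypothetical paths). The new keyword options are taken
# from the src/ingestion/api.py diff below.
from src.ingestion.api import ingest_from_directory

result = ingest_from_directory(
    docs_dir="data/docs",             # hypothetical input directory
    output_path="data/chunks.jsonl",
    provider="sentence-transformers",
    dim=384,
    use_docling=True,       # multi-format parsing via Docling, when installed
    use_structure=True,     # structure-aware chunking (preserves tables/sections)
    recursive=False,        # set True to walk subdirectories
)
print(result.status, result.documents)
```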
Dockerfile
CHANGED
```diff
@@ -2,14 +2,20 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies for Docling (PDF, OCR, image processing)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements and install dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
     pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
```
render.yaml
DELETED
```diff
@@ -1,17 +0,0 @@
-services:
-  - type: web
-    name: rag-api
-    runtime: python
-    buildCommand: pip install -r requirements.txt
-    startCommand: uvicorn src.api.main:app --host 0.0.0.0 --port $PORT
-    envVars:
-      - key: PYTHON_VERSION
-        value: "3.11"
-      - key: PINECONE_API_KEY
-        sync: false
-      - key: PINECONE_INDEX_NAME
-        sync: false
-      - key: GEMINI_API_KEY
-        sync: false
-      - key: GROQ_API_KEY
-        sync: false
```
requirements.txt
CHANGED
```diff
@@ -10,3 +10,6 @@ requests>=2.31.0
 python-dotenv>=1.0.0
 rank-bm25>=0.2.2
 PyPDF2>=3.0.0
+
+# Document processing
+docling>=2.15.0
```
src/ingestion/api.py
CHANGED
```diff
@@ -3,18 +3,37 @@ Ingestion API for UI integration.
 
 Provides functions to ingest documents from a directory
 and optionally sync to Pinecone.
+
+Supports both legacy markdown-only loading and multi-format
+loading via Docling.
 """
 
 import json
 import os
+import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
 
 from src.ingestion.load_docs import load_markdown_docs
-from src.ingestion.chunker import chunk_documents
+from src.ingestion.chunker import chunk_documents, chunk_documents_with_structure
 from src.ingestion.embeddings import batch_embed_chunks
 
+logger = logging.getLogger(__name__)
+
+# Try to import Docling loader (optional dependency)
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    logger.info("Docling not available, using markdown-only loader")
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
 
 @dataclass
 class IngestionResult:
@@ -38,7 +57,11 @@ def ingest_from_directory(
     docs_dir: str,
     output_path: str = "data/chunks.jsonl",
     provider: str = "sentence-transformers",
-    dim: int = 384
+    dim: int = 384,
+    use_docling: bool = True,
+    extensions: Optional[List[str]] = None,
+    use_structure: bool = True,
+    recursive: bool = False
 ) -> IngestionResult:
     """
     Ingest documents from a directory and save to chunks.jsonl.
@@ -48,6 +71,10 @@ def ingest_from_directory(
         output_path: Path to save chunks.jsonl
         provider: Embedding provider ("sentence-transformers" or "local")
         dim: Embedding dimension
+        use_docling: Use Docling for multi-format parsing (if available)
+        extensions: File extensions to process (None = all supported)
+        use_structure: Use structure-aware chunking (requires Docling)
+        recursive: Search subdirectories recursively
 
     Returns:
         IngestionResult with status and counts
@@ -65,8 +92,20 @@ def ingest_from_directory(
        )
 
    try:
-        # Load documents
-        docs = load_markdown_docs(docs_dir)
+        # Choose loader based on availability and preference
+        if use_docling and DOCLING_AVAILABLE:
+            logger.info("Using Docling for multi-format document loading")
+            parsed_docs = load_documents_with_docling(
+                docs_dir,
+                extensions=extensions,
+                recursive=recursive
+            )
+            docs = convert_to_legacy_format(parsed_docs)
+        else:
+            logger.info("Using legacy markdown loader")
+            docs = load_markdown_docs(docs_dir)
+            use_structure = False  # No structure without Docling
+
        if not docs:
            return IngestionResult(
                status="warning",
@@ -77,10 +116,19 @@ def ingest_from_directory(
            )
 
        # Count successful loads
-        doc_count = len([d for d in docs if d.get("status") == "OK"])
-
-        # Chunk documents
-        chunks = chunk_documents(docs, max_tokens=300, overlap=50)
+        doc_count = len([d for d in docs if d.get("status") == "OK"])
+
+        # Chunk documents (structure-aware or legacy)
+        if use_structure and DOCLING_AVAILABLE:
+            chunks = chunk_documents_with_structure(
+                docs,
+                max_tokens=300,
+                overlap=50,
+                use_structure=True
+            )
+        else:
+            chunks = chunk_documents(docs, max_tokens=300, overlap=50)
 
        if not chunks:
            return IngestionResult(
                status="warning",
@@ -93,12 +141,15 @@ def ingest_from_directory(
        # Generate embeddings
        embedded = batch_embed_chunks(chunks, provider=provider, dim=dim)
 
-        # Merge text back into embedded chunks
-        chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
+        # Merge text and metadata back into embedded chunks
+        chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
        for e in embedded:
            key = (e["filename"], e["chunk_id"])
            if key in chunk_map:
-                e["text"] = chunk_map[key].get("text", "")
+                src = chunk_map[key]
+                e["text"] = src.get("text", "")
+                e["element_type"] = src.get("element_type", "text")
+                e["section_heading"] = src.get("section_heading", "")
 
        # Save to file
        save_path = Path(output_path)
@@ -112,6 +163,8 @@ def ingest_from_directory(
                    "chunk_id": e["chunk_id"],
                    "text": e.get("text", ""),
                    "chars": e.get("chars", 0),
+                    "element_type": e.get("element_type", "text"),
+                    "section_heading": e.get("section_heading", ""),
                    "embedding": e["embedding"]
                }
                fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
@@ -125,6 +178,7 @@ def ingest_from_directory(
        )
 
    except Exception as e:
+        logger.exception("Ingestion failed")
        return IngestionResult(
            status="error",
            documents=0,
@@ -239,6 +293,20 @@ def sync_to_pinecone(
    )
 
 
+def get_supported_formats() -> Dict[str, Any]:
+    """
+    Get information about supported document formats.
+
+    Returns:
+        Dict with docling availability and supported extensions
+    """
+    return {
+        "docling_available": DOCLING_AVAILABLE,
+        "supported_extensions": list(SUPPORTED_EXTENSIONS),
+        "loader": "docling" if DOCLING_AVAILABLE else "markdown-only"
+    }
+
+
 def get_index_status(chunks_path: str = "data/chunks.jsonl") -> Dict[str, Any]:
    """
    Get status of the current index.
```
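
A short sketch of how a caller might use the new `get_supported_formats()` helper to detect the active loader before ingesting; `data/docs` is again a hypothetical path:

```python
# Sketch: probe loader capabilities, then fall back to markdown-only mode.
from src.ingestion.api import get_supported_formats, ingest_from_directory

info = get_supported_formats()
print(info["loader"], sorted(info["supported_extensions"]))

if not info["docling_available"]:
    # use_structure is disabled automatically on the legacy path
    result = ingest_from_directory("data/docs", use_docling=False)
```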
src/ingestion/chunker.py
CHANGED
```diff
@@ -1,11 +1,15 @@
 # RAG-document-assistant/ingestion/chunker.py
 """
 Text chunking utility for RAG ingestion.
-Inputs: list of docs from load_docs.py
+Inputs: list of docs from load_docs.py or docling_loader.py
 Output: list of chunks with metadata
+
+Supports:
+- Simple character-based chunking (legacy)
+- Structure-aware chunking using Docling elements
 """
 
-from typing import List, Dict
+from typing import List, Dict, Optional, Any
 
 def chunk_text(
     text: str,
@@ -98,6 +102,199 @@ def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
     return all_chunks
 
 
+def chunk_by_structure(
+    elements: List[Any],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    include_heading_context: bool = True
+) -> List[Dict]:
+    """
+    Structure-aware chunking using Docling document elements.
+
+    Groups content by semantic boundaries (headings, tables) rather than
+    arbitrary character counts. Falls back to character-based splitting
+    for oversized elements.
+
+    Args:
+        elements: List of DocumentElement objects from docling_loader
+        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
+        overlap: Token overlap for split elements
+        keep_tables_intact: Keep tables as single chunks even if large
+        include_heading_context: Prepend parent heading to chunks
+
+    Returns:
+        List of chunk dicts with element_type and section metadata
+    """
+    if not elements:
+        return []
+
+    max_chars = max_tokens * 4
+    chunks = []
+    current_heading = ""
+    current_section = []
+    current_chars = 0
+
+    def flush_section():
+        """Flush accumulated section content as a chunk."""
+        nonlocal current_section, current_chars
+        if not current_section:
+            return
+
+        combined_text = "\n\n".join(el.text for el in current_section)
+        if combined_text.strip():
+            # Prepend heading context if available
+            if include_heading_context and current_heading:
+                combined_text = f"## {current_heading}\n\n{combined_text}"
+
+            chunks.append({
+                "text": combined_text.strip(),
+                "chars": len(combined_text),
+                "element_type": "section",
+                "section_heading": current_heading,
+                "element_count": len(current_section)
+            })
+
+        current_section = []
+        current_chars = 0
+
+    for element in elements:
+        el_type = getattr(element, "element_type", "paragraph")
+        el_text = getattr(element, "text", str(element))
+        el_chars = len(el_text)
+
+        # Handle headings - start new section
+        if el_type == "heading":
+            flush_section()
+            current_heading = el_text
+            continue
+
+        # Handle tables - keep intact if configured
+        if el_type == "table" and keep_tables_intact:
+            flush_section()
+            table_text = el_text
+            if include_heading_context and current_heading:
+                table_text = f"## {current_heading}\n\n{el_text}"
+
+            chunks.append({
+                "text": table_text.strip(),
+                "chars": len(table_text),
+                "element_type": "table",
+                "section_heading": current_heading,
+                "element_count": 1
+            })
+            continue
+
+        # Check if adding this element exceeds limit
+        if current_chars + el_chars > max_chars and current_section:
+            flush_section()
+
+        # Handle oversized single elements
+        if el_chars > max_chars:
+            flush_section()
+            # Split large element using character-based chunking
+            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
+            for i, sub_text in enumerate(sub_chunks):
+                prefix = ""
+                if include_heading_context and current_heading:
+                    prefix = f"## {current_heading}\n\n"
+                chunks.append({
+                    "text": f"{prefix}{sub_text}".strip(),
+                    "chars": len(sub_text) + len(prefix),
+                    "element_type": f"{el_type}_split",
+                    "section_heading": current_heading,
+                    "split_index": i,
+                    "element_count": 1
+                })
+            continue
+
+        # Accumulate element in current section
+        current_section.append(element)
+        current_chars += el_chars
+
+    # Flush remaining content
+    flush_section()
+
+    return chunks
+
+
+def chunk_documents_with_structure(
+    docs: List[Dict],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    use_structure: bool = True
+) -> List[Dict]:
+    """
+    Chunk documents using structure-aware or legacy chunking.
+
+    Args:
+        docs: List of document dicts (from docling_loader or load_docs)
+        max_tokens: Maximum tokens per chunk
+        overlap: Token overlap between chunks
+        keep_tables_intact: Keep tables as single chunks
+        use_structure: Use structure-aware chunking if elements available
+
+    Returns:
+        List of chunk dicts with metadata
+    """
+    if not isinstance(docs, list):
+        raise TypeError("docs must be a list")
+
+    all_chunks = []
+
+    for d in docs:
+        if not isinstance(d, dict):
+            raise TypeError("Each document must be a dictionary")
+
+        status = d.get("status", "")
+        if status != "OK":
+            continue
+
+        filename = d.get("filename", "unknown")
+        elements = d.get("elements", [])
+
+        # Use structure-aware chunking if elements available
+        if use_structure and elements:
+            raw_chunks = chunk_by_structure(
+                elements,
+                max_tokens=max_tokens,
+                overlap=overlap,
+                keep_tables_intact=keep_tables_intact
+            )
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch["text"],
+                    "chars": ch["chars"],
+                    "element_type": ch.get("element_type", "section"),
+                    "section_heading": ch.get("section_heading", ""),
+                    "format": d.get("format", ""),
+                    "page_count": d.get("page_count", 0)
+                })
+        else:
+            # Fallback to legacy text-based chunking
+            text = d.get("text", "")
+            if not text:
+                continue
+
+            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch,
+                    "chars": len(ch),
+                    "element_type": "text",
+                    "section_heading": "",
+                    "format": d.get("format", ".md"),
+                    "page_count": 0
+                })
+
+    return all_chunks
+
+
 if __name__ == "__main__":
     # Minimal test
     sample = "This is a test text " * 200
```
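
To make the chunking behavior concrete, a minimal sketch of `chunk_by_structure` on hand-built elements; the function reads `element_type`/`text` via `getattr`, so any object with those attributes works, and `DocumentElement` comes from the new loader below:

```python
# Sketch: the heading starts a section, the table stays intact as its own
# chunk, and both chunks get "## Results" prepended as heading context.
from src.ingestion.chunker import chunk_by_structure
from src.ingestion.docling_loader import DocumentElement

elements = [
    DocumentElement(element_type="heading", text="Results", level=2),
    DocumentElement(element_type="paragraph", text="Accuracy improved by 4 points."),
    DocumentElement(element_type="table", text="| model | acc |\n|---|---|\n| base | 0.81 |"),
]

for chunk in chunk_by_structure(elements, max_tokens=300):
    print(chunk["element_type"], repr(chunk["section_heading"]), chunk["chars"])
# -> section 'Results' ...
# -> table 'Results' ...
```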
src/ingestion/docling_loader.py
ADDED
@@ -0,0 +1,364 @@

```python
"""
Docling-based document loader for multi-format document processing.

Supports: PDF, DOCX, PPTX, HTML, images, and Markdown.
Provides structure-aware parsing with table extraction and hierarchy preservation.
"""

import os
import glob
from pathlib import Path
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, field
import logging

logger = logging.getLogger(__name__)

# Supported file extensions
SUPPORTED_EXTENSIONS = {
    ".pdf", ".docx", ".pptx", ".xlsx",
    ".html", ".htm",
    ".md", ".markdown",
    ".png", ".jpg", ".jpeg", ".tiff", ".bmp"
}


@dataclass
class DocumentElement:
    """Represents a structural element in a document."""
    element_type: str  # paragraph, table, heading, list, code, image
    text: str
    level: int = 0  # heading level (1-6) or nesting depth
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ParsedDocument:
    """Result of parsing a document with Docling."""
    filename: str
    path: str
    elements: List[DocumentElement]
    format: str
    page_count: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    status: str = "OK"
    error: Optional[str] = None

    @property
    def full_text(self) -> str:
        """Get concatenated text from all elements."""
        return "\n\n".join(el.text for el in self.elements if el.text.strip())

    @property
    def chars(self) -> int:
        return len(self.full_text)

    @property
    def words(self) -> int:
        return len(self.full_text.split())


def _get_docling_converter():
    """Lazy load Docling converter to avoid import overhead."""
    try:
        from docling.document_converter import DocumentConverter
        return DocumentConverter()
    except ImportError as e:
        logger.error(f"Docling not installed: {e}")
        raise ImportError(
            "Docling is required for multi-format document loading. "
            "Install with: pip install docling"
        ) from e


def _extract_elements_from_docling(doc_result) -> List[DocumentElement]:
    """
    Extract structured elements from a Docling conversion result.

    Args:
        doc_result: Docling ConversionResult object

    Returns:
        List of DocumentElement objects
    """
    elements = []

    try:
        # Get the DoclingDocument
        docling_doc = doc_result.document

        # Iterate through document items
        for item, level in docling_doc.iterate_items():
            item_type = item.__class__.__name__.lower()

            # Map Docling item types to our element types
            if "heading" in item_type or "title" in item_type:
                el_type = "heading"
                el_level = getattr(item, "level", 1)
            elif "table" in item_type:
                el_type = "table"
                el_level = 0
            elif "list" in item_type:
                el_type = "list"
                el_level = level
            elif "code" in item_type:
                el_type = "code"
                el_level = 0
            elif "image" in item_type or "figure" in item_type:
                el_type = "image"
                el_level = 0
            else:
                el_type = "paragraph"
                el_level = level

            # Extract text content
            text = ""
            if hasattr(item, "text") and item.text:
                text = item.text
            elif hasattr(item, "export_to_markdown"):
                try:
                    # Some items require doc parameter
                    text = item.export_to_markdown(docling_doc)
                except TypeError:
                    try:
                        text = item.export_to_markdown()
                    except Exception:
                        text = str(item) if hasattr(item, "__str__") else ""
            elif hasattr(item, "__str__"):
                text = str(item)

            if text and text.strip():
                elements.append(DocumentElement(
                    element_type=el_type,
                    text=text.strip(),
                    level=el_level,
                    metadata={
                        "original_type": item_type,
                        "depth": level
                    }
                ))

    except Exception as e:
        logger.warning(f"Error extracting elements: {e}")
        # Fallback: try to get markdown export
        try:
            md_text = doc_result.document.export_to_markdown()
            if md_text:
                elements.append(DocumentElement(
                    element_type="paragraph",
                    text=md_text,
                    level=0
                ))
        except Exception:
            pass

    return elements


def load_document_with_docling(file_path: str) -> ParsedDocument:
    """
    Load a single document using Docling.

    Args:
        file_path: Path to the document file

    Returns:
        ParsedDocument with extracted structure and content
    """
    path = Path(file_path)

    if not path.exists():
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=path.suffix.lower(),
            status="ERROR",
            error=f"File not found: {file_path}"
        )

    ext = path.suffix.lower()
    if ext not in SUPPORTED_EXTENSIONS:
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=ext,
            status="SKIPPED",
            error=f"Unsupported format: {ext}"
        )

    try:
        converter = _get_docling_converter()
        result = converter.convert(str(path))

        elements = _extract_elements_from_docling(result)

        # Get page count if available
        page_count = 0
        try:
            if hasattr(result.document, "pages"):
                page_count = len(result.document.pages)
        except Exception:
            pass

        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=elements,
            format=ext,
            page_count=page_count,
            metadata={
                "converter": "docling",
                "element_count": len(elements)
            },
            status="OK"
        )

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=ext,
            status="ERROR",
            error=str(e)
        )


def load_documents_with_docling(
    dir_path: str,
    extensions: Optional[List[str]] = None,
    max_chars: int = 50000,
    recursive: bool = False
) -> List[ParsedDocument]:
    """
    Load multiple documents from a directory using Docling.

    Args:
        dir_path: Path to directory containing documents
        extensions: List of extensions to process (default: all supported)
        max_chars: Maximum characters per document (skip larger files)
        recursive: Whether to search subdirectories

    Returns:
        List of ParsedDocument objects
    """
    path = Path(dir_path).expanduser()

    if not path.is_dir():
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    if extensions is None:
        extensions = list(SUPPORTED_EXTENSIONS)
    else:
        extensions = [e if e.startswith(".") else f".{e}" for e in extensions]

    # Find all matching files
    files = []
    for ext in extensions:
        pattern = f"**/*{ext}" if recursive else f"*{ext}"
        files.extend(path.glob(pattern))

    files = sorted(set(files))

    documents = []
    for file_path in files:
        doc = load_document_with_docling(str(file_path))

        # Check size limit
        if doc.status == "OK" and doc.chars > max_chars:
            doc.status = "SKIPPED_TOO_LARGE"
            doc.error = f"Document exceeds {max_chars} chars ({doc.chars})"
            doc.elements = []

        documents.append(doc)

    return documents


def convert_to_legacy_format(docs: List[ParsedDocument]) -> List[Dict]:
    """
    Convert ParsedDocument list to legacy format for backward compatibility.

    Args:
        docs: List of ParsedDocument objects

    Returns:
        List of dicts matching load_markdown_docs output format
    """
    legacy = []
    for doc in docs:
        legacy.append({
            "filename": doc.filename,
            "path": doc.path,
            "text": doc.full_text if doc.status == "OK" else None,
            "chars": doc.chars,
            "words": doc.words,
            "status": doc.status,
            "format": doc.format,
            "elements": doc.elements,  # Additional: structured elements
            "page_count": doc.page_count,
            "metadata": doc.metadata
        })
    return legacy


def print_summary(docs: List[ParsedDocument]):
    """Print summary of loaded documents."""
    if not docs:
        print("No documents found or all were skipped.")
        return

    print(f"{'FILENAME':40} {'FORMAT':8} {'STATUS':20} {'CHARS':>8} {'ELEMENTS':>8}")
    print("-" * 90)

    for d in docs:
        name = d.filename[:40]
        fmt = d.format[:8]
        status = d.status[:20]
        chars = d.chars
        elements = len(d.elements)
        print(f"{name:40} {fmt:8} {status:20} {chars:8d} {elements:8d}")

    ok_count = sum(1 for d in docs if d.status == "OK")
    skipped = len(docs) - ok_count
    print("-" * 90)
    print(f"Total: {len(docs)} OK: {ok_count} Skipped/Errors: {skipped}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Load documents using Docling for RAG ingestion."
    )
    parser.add_argument("dir", help="Directory containing documents")
    parser.add_argument(
        "--extensions", "-e",
        nargs="+",
        default=None,
        help="File extensions to process (default: all supported)"
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=50000,
        help="Max characters to accept (default: 50000)"
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Search subdirectories recursively"
    )

    args = parser.parse_args()

    docs = load_documents_with_docling(
        args.dir,
        extensions=args.extensions,
        max_chars=args.max_chars,
        recursive=args.recursive
    )
    print_summary(docs)
```
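
The new module is also usable directly; a sketch with a hypothetical `sample_docs` directory (the module doubles as a CLI via its `__main__` block, e.g. `python src/ingestion/docling_loader.py sample_docs -r`):

```python
# Sketch: parse a mixed folder, then count tables per successfully parsed file.
from src.ingestion.docling_loader import load_documents_with_docling, print_summary

docs = load_documents_with_docling("sample_docs", extensions=[".pdf", ".docx"], recursive=True)
print_summary(docs)

for doc in docs:
    if doc.status == "OK":
        tables = [el for el in doc.elements if el.element_type == "table"]
        print(f"{doc.filename}: {doc.page_count} pages, {len(tables)} tables")
```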
src/ingestion/load_docs.py
CHANGED
```diff
@@ -1,13 +1,13 @@
 # RAG-document-assistant/ingestion/load_docs.py
 """
-  - load_markdown_docs(
+Document loader for RAG ingestion.
+
+Provides:
+  - load_markdown_docs(): Legacy markdown-only loader
+  - load_documents(): Unified loader (uses Docling if available, falls back to markdown)
 
 CLI:
-    > python3 load_docs.py /full/path/to/your/
+    > python3 load_docs.py /full/path/to/your/docs/folder
 prints a summary table for each file and exits with code 0.
 """
@@ -15,7 +15,10 @@ import os
 import glob
 import argparse
 import re
-from typing import List, Dict
+import logging
+from typing import List, Dict, Optional
+
+logger = logging.getLogger(__name__)
 
 def _clean_markdown(text: str) -> str:
     """
@@ -128,12 +131,97 @@ def print_summary(docs: List[Dict]):
     print("-" * 80)
     print(f"Total files: {len(docs)} OK: {ok_count} Skipped: {skipped}")
 
+# Try to import Docling loader
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        print_summary as docling_print_summary,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
+
+def load_documents(
+    dir_path: str,
+    extensions: Optional[List[str]] = None,
+    max_chars: int = 50000,
+    use_docling: bool = True,
+    recursive: bool = False
+) -> List[Dict]:
+    """
+    Unified document loader - uses Docling if available, falls back to markdown.
+
+    Args:
+        dir_path: Path to directory containing documents
+        extensions: File extensions to process (None = all supported)
+        max_chars: Maximum characters per document
+        use_docling: Prefer Docling if available
+        recursive: Search subdirectories
+
+    Returns:
+        List of document dicts with text and metadata
+    """
+    if use_docling and DOCLING_AVAILABLE:
+        logger.info("Using Docling multi-format loader")
+        parsed = load_documents_with_docling(
+            dir_path,
+            extensions=extensions,
+            max_chars=max_chars,
+            recursive=recursive
+        )
+        return convert_to_legacy_format(parsed)
+    else:
+        logger.info("Using legacy markdown loader")
+        ext = ".md"
+        if extensions and len(extensions) > 0:
+            ext = extensions[0] if extensions[0].startswith(".") else f".{extensions[0]}"
+        return load_markdown_docs(dir_path, ext=ext, max_chars=max_chars)
+
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-    parser.add_argument("
+    parser = argparse.ArgumentParser(
+        description="Load and summarize documents for RAG ingestion."
+    )
+    parser.add_argument("dir", help="Directory containing documents")
+    parser.add_argument(
+        "--ext", "-e",
+        nargs="+",
+        default=None,
+        help="File extensions to load (default: all supported)"
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=50000,
+        help="Max characters to accept (default 50k)"
+    )
+    parser.add_argument(
+        "--no-docling",
+        action="store_true",
+        help="Disable Docling, use markdown-only loader"
+    )
+    parser.add_argument(
+        "--recursive", "-r",
+        action="store_true",
+        help="Search subdirectories"
+    )
     args = parser.parse_args()
 
+    if args.no_docling or not DOCLING_AVAILABLE:
+        # Legacy markdown mode
+        ext = args.ext[0] if args.ext else ".md"
+        docs = load_markdown_docs(args.dir, ext=ext, max_chars=args.max_chars)
+        print_summary(docs)
+    else:
+        # Docling multi-format mode
+        parsed = load_documents_with_docling(
+            args.dir,
+            extensions=args.ext,
+            max_chars=args.max_chars,
+            recursive=args.recursive
+        )
+        docling_print_summary(parsed)
```
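
Finally, a sketch of the unified `load_documents()` entry point added above, which hides the Docling-vs-markdown decision from callers; `data/docs` is a hypothetical path:

```python
# Sketch: callers get the same legacy dict shape whichever loader ran.
from src.ingestion.load_docs import load_documents

docs = load_documents("data/docs", use_docling=True, recursive=False)
ok = [d for d in docs if d["status"] == "OK"]
print(f"loaded {len(ok)}/{len(docs)} documents")
```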