Add Docling integration for multi-format document processing
- Add docling_loader.py for PDF, DOCX, PPTX, HTML, image support
- Add structure-aware chunking (preserves tables, sections)
- Update ingestion API with use_docling and use_structure options
- Update Dockerfile with system deps for Docling
- Remove render.yaml (using HuggingFace Spaces only)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- Dockerfile +8 -2
- render.yaml +0 -17
- requirements.txt +3 -0
- src/ingestion/api.py +78 -10
- src/ingestion/chunker.py +199 -2
- src/ingestion/docling_loader.py +364 -0
- src/ingestion/load_docs.py +100 -12
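
For orientation before the per-file diffs, here is a minimal usage sketch of the new ingestion entry point, assuming the package imports as `src.ingestion` and a hypothetical `data/docs` folder; the signature matches the one added in `src/ingestion/api.py` below:

```python
# Minimal sketch (hypothetical paths). The new keyword options are taken
# from the src/ingestion/api.py diff below.
from src.ingestion.api import ingest_from_directory

result = ingest_from_directory(
    docs_dir="data/docs",             # hypothetical input directory
    output_path="data/chunks.jsonl",
    provider="sentence-transformers",
    dim=384,
    use_docling=True,       # multi-format parsing via Docling, when installed
    use_structure=True,     # structure-aware chunking (preserves tables/sections)
    recursive=False,        # set True to walk subdirectories
)
print(result.status, result.documents)
```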
Dockerfile
CHANGED
```diff
@@ -2,14 +2,20 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies for Docling (PDF, OCR, image processing)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements and install dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
     pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
```
render.yaml
DELETED
```diff
@@ -1,17 +0,0 @@
-services:
-  - type: web
-    name: rag-api
-    runtime: python
-    buildCommand: pip install -r requirements.txt
-    startCommand: uvicorn src.api.main:app --host 0.0.0.0 --port $PORT
-    envVars:
-      - key: PYTHON_VERSION
-        value: "3.11"
-      - key: PINECONE_API_KEY
-        sync: false
-      - key: PINECONE_INDEX_NAME
-        sync: false
-      - key: GEMINI_API_KEY
-        sync: false
-      - key: GROQ_API_KEY
-        sync: false
```
requirements.txt
CHANGED
```diff
@@ -10,3 +10,6 @@ requests>=2.31.0
 python-dotenv>=1.0.0
 rank-bm25>=0.2.2
 PyPDF2>=3.0.0
+
+# Document processing
+docling>=2.15.0
```
src/ingestion/api.py
CHANGED
```diff
@@ -3,18 +3,37 @@ Ingestion API for UI integration.
 
 Provides functions to ingest documents from a directory
 and optionally sync to Pinecone.
+
+Supports both legacy markdown-only loading and multi-format
+loading via Docling.
 """
 
 import json
 import os
+import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
 
 from src.ingestion.load_docs import load_markdown_docs
-from src.ingestion.chunker import chunk_documents
+from src.ingestion.chunker import chunk_documents, chunk_documents_with_structure
 from src.ingestion.embeddings import batch_embed_chunks
 
+logger = logging.getLogger(__name__)
+
+# Try to import Docling loader (optional dependency)
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    logger.info("Docling not available, using markdown-only loader")
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
 
 @dataclass
 class IngestionResult:
@@ -38,7 +57,11 @@ def ingest_from_directory(
     docs_dir: str,
     output_path: str = "data/chunks.jsonl",
     provider: str = "sentence-transformers",
-    dim: int = 384
+    dim: int = 384,
+    use_docling: bool = True,
+    extensions: Optional[List[str]] = None,
+    use_structure: bool = True,
+    recursive: bool = False
 ) -> IngestionResult:
     """
     Ingest documents from a directory and save to chunks.jsonl.
@@ -48,6 +71,10 @@ def ingest_from_directory(
         output_path: Path to save chunks.jsonl
         provider: Embedding provider ("sentence-transformers" or "local")
         dim: Embedding dimension
+        use_docling: Use Docling for multi-format parsing (if available)
+        extensions: File extensions to process (None = all supported)
+        use_structure: Use structure-aware chunking (requires Docling)
+        recursive: Search subdirectories recursively
 
     Returns:
         IngestionResult with status and counts
@@ -65,8 +92,20 @@ def ingest_from_directory(
        )
 
    try:
-        # Load documents
-        docs = load_markdown_docs(docs_dir)
+        # Choose loader based on availability and preference
+        if use_docling and DOCLING_AVAILABLE:
+            logger.info("Using Docling for multi-format document loading")
+            parsed_docs = load_documents_with_docling(
+                docs_dir,
+                extensions=extensions,
+                recursive=recursive
+            )
+            docs = convert_to_legacy_format(parsed_docs)
+        else:
+            logger.info("Using legacy markdown loader")
+            docs = load_markdown_docs(docs_dir)
+            use_structure = False  # No structure without Docling
+
        if not docs:
            return IngestionResult(
                status="warning",
@@ -77,10 +116,19 @@ def ingest_from_directory(
            )
 
        # Count successful loads
-        doc_count = len([d for d in docs if d.get("status") == "OK"])
-
-        # Chunk documents
-        chunks = chunk_documents(docs, max_tokens=300, overlap=50)
+        doc_count = len([d for d in docs if d.get("status") == "OK"])
+
+        # Chunk documents (structure-aware or legacy)
+        if use_structure and DOCLING_AVAILABLE:
+            chunks = chunk_documents_with_structure(
+                docs,
+                max_tokens=300,
+                overlap=50,
+                use_structure=True
+            )
+        else:
+            chunks = chunk_documents(docs, max_tokens=300, overlap=50)
 
        if not chunks:
            return IngestionResult(
                status="warning",
@@ -93,12 +141,15 @@ def ingest_from_directory(
        # Generate embeddings
        embedded = batch_embed_chunks(chunks, provider=provider, dim=dim)
 
-        # Merge text back into embedded chunks
-        chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
+        # Merge text and metadata back into embedded chunks
+        chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
        for e in embedded:
            key = (e["filename"], e["chunk_id"])
            if key in chunk_map:
-                e["text"] = chunk_map[key].get("text", "")
+                src = chunk_map[key]
+                e["text"] = src.get("text", "")
+                e["element_type"] = src.get("element_type", "text")
+                e["section_heading"] = src.get("section_heading", "")
 
        # Save to file
        save_path = Path(output_path)
@@ -112,6 +163,8 @@ def ingest_from_directory(
                    "chunk_id": e["chunk_id"],
                    "text": e.get("text", ""),
                    "chars": e.get("chars", 0),
+                    "element_type": e.get("element_type", "text"),
+                    "section_heading": e.get("section_heading", ""),
                    "embedding": e["embedding"]
                }
                fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
@@ -125,6 +178,7 @@ def ingest_from_directory(
        )
 
    except Exception as e:
+        logger.exception("Ingestion failed")
        return IngestionResult(
            status="error",
            documents=0,
@@ -239,6 +293,20 @@ def sync_to_pinecone(
    )
 
 
+def get_supported_formats() -> Dict[str, Any]:
+    """
+    Get information about supported document formats.
+
+    Returns:
+        Dict with docling availability and supported extensions
+    """
+    return {
+        "docling_available": DOCLING_AVAILABLE,
+        "supported_extensions": list(SUPPORTED_EXTENSIONS),
+        "loader": "docling" if DOCLING_AVAILABLE else "markdown-only"
+    }
+
+
 def get_index_status(chunks_path: str = "data/chunks.jsonl") -> Dict[str, Any]:
    """
    Get status of the current index.
```
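
A short sketch of how a caller might use the new `get_supported_formats()` helper to detect the active loader before ingesting; `data/docs` is again a hypothetical path:

```python
# Sketch: probe loader capabilities, then fall back to markdown-only mode.
from src.ingestion.api import get_supported_formats, ingest_from_directory

info = get_supported_formats()
print(info["loader"], sorted(info["supported_extensions"]))

if not info["docling_available"]:
    # use_structure is disabled automatically on the legacy path
    result = ingest_from_directory("data/docs", use_docling=False)
```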
src/ingestion/chunker.py
CHANGED
```diff
@@ -1,11 +1,15 @@
 # RAG-document-assistant/ingestion/chunker.py
 """
 Text chunking utility for RAG ingestion.
-Inputs: list of docs from load_docs.py
+Inputs: list of docs from load_docs.py or docling_loader.py
 Output: list of chunks with metadata
+
+Supports:
+- Simple character-based chunking (legacy)
+- Structure-aware chunking using Docling elements
 """
 
-from typing import List, Dict
+from typing import List, Dict, Optional, Any
 
 def chunk_text(
     text: str,
@@ -98,6 +102,199 @@ def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
     return all_chunks
 
 
+def chunk_by_structure(
+    elements: List[Any],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    include_heading_context: bool = True
+) -> List[Dict]:
+    """
+    Structure-aware chunking using Docling document elements.
+
+    Groups content by semantic boundaries (headings, tables) rather than
+    arbitrary character counts. Falls back to character-based splitting
+    for oversized elements.
+
+    Args:
+        elements: List of DocumentElement objects from docling_loader
+        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
+        overlap: Token overlap for split elements
+        keep_tables_intact: Keep tables as single chunks even if large
+        include_heading_context: Prepend parent heading to chunks
+
+    Returns:
+        List of chunk dicts with element_type and section metadata
+    """
+    if not elements:
+        return []
+
+    max_chars = max_tokens * 4
+    chunks = []
+    current_heading = ""
+    current_section = []
+    current_chars = 0
+
+    def flush_section():
+        """Flush accumulated section content as a chunk."""
+        nonlocal current_section, current_chars
+        if not current_section:
+            return
+
+        combined_text = "\n\n".join(el.text for el in current_section)
+        if combined_text.strip():
+            # Prepend heading context if available
+            if include_heading_context and current_heading:
+                combined_text = f"## {current_heading}\n\n{combined_text}"
+
+            chunks.append({
+                "text": combined_text.strip(),
+                "chars": len(combined_text),
+                "element_type": "section",
+                "section_heading": current_heading,
+                "element_count": len(current_section)
+            })
+
+        current_section = []
+        current_chars = 0
+
+    for element in elements:
+        el_type = getattr(element, "element_type", "paragraph")
+        el_text = getattr(element, "text", str(element))
+        el_chars = len(el_text)
+
+        # Handle headings - start new section
+        if el_type == "heading":
+            flush_section()
+            current_heading = el_text
+            continue
+
+        # Handle tables - keep intact if configured
+        if el_type == "table" and keep_tables_intact:
+            flush_section()
+            table_text = el_text
+            if include_heading_context and current_heading:
+                table_text = f"## {current_heading}\n\n{el_text}"
+
+            chunks.append({
+                "text": table_text.strip(),
+                "chars": len(table_text),
+                "element_type": "table",
+                "section_heading": current_heading,
+                "element_count": 1
+            })
+            continue
+
+        # Check if adding this element exceeds limit
+        if current_chars + el_chars > max_chars and current_section:
+            flush_section()
+
+        # Handle oversized single elements
+        if el_chars > max_chars:
+            flush_section()
+            # Split large element using character-based chunking
+            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
+            for i, sub_text in enumerate(sub_chunks):
+                prefix = ""
+                if include_heading_context and current_heading:
+                    prefix = f"## {current_heading}\n\n"
+                chunks.append({
+                    "text": f"{prefix}{sub_text}".strip(),
+                    "chars": len(sub_text) + len(prefix),
+                    "element_type": f"{el_type}_split",
+                    "section_heading": current_heading,
+                    "split_index": i,
+                    "element_count": 1
+                })
+            continue
+
+        # Accumulate element in current section
+        current_section.append(element)
+        current_chars += el_chars
+
+    # Flush remaining content
+    flush_section()
+
+    return chunks
+
+
+def chunk_documents_with_structure(
+    docs: List[Dict],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    use_structure: bool = True
+) -> List[Dict]:
+    """
+    Chunk documents using structure-aware or legacy chunking.
+
+    Args:
+        docs: List of document dicts (from docling_loader or load_docs)
+        max_tokens: Maximum tokens per chunk
+        overlap: Token overlap between chunks
+        keep_tables_intact: Keep tables as single chunks
+        use_structure: Use structure-aware chunking if elements available
+
+    Returns:
+        List of chunk dicts with metadata
+    """
+    if not isinstance(docs, list):
+        raise TypeError("docs must be a list")
+
+    all_chunks = []
+
+    for d in docs:
+        if not isinstance(d, dict):
+            raise TypeError("Each document must be a dictionary")
+
+        status = d.get("status", "")
+        if status != "OK":
+            continue
+
+        filename = d.get("filename", "unknown")
+        elements = d.get("elements", [])
+
+        # Use structure-aware chunking if elements available
+        if use_structure and elements:
+            raw_chunks = chunk_by_structure(
+                elements,
+                max_tokens=max_tokens,
+                overlap=overlap,
+                keep_tables_intact=keep_tables_intact
+            )
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch["text"],
+                    "chars": ch["chars"],
+                    "element_type": ch.get("element_type", "section"),
+                    "section_heading": ch.get("section_heading", ""),
+                    "format": d.get("format", ""),
+                    "page_count": d.get("page_count", 0)
+                })
+        else:
+            # Fallback to legacy text-based chunking
+            text = d.get("text", "")
+            if not text:
+                continue
+
+            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch,
+                    "chars": len(ch),
+                    "element_type": "text",
+                    "section_heading": "",
+                    "format": d.get("format", ".md"),
+                    "page_count": 0
+                })
+
+    return all_chunks
+
+
 if __name__ == "__main__":
     # Minimal test
     sample = "This is a test text " * 200
```
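
To make the chunking behavior concrete, a minimal sketch of `chunk_by_structure` on hand-built elements; the function reads `element_type`/`text` via `getattr`, so any object with those attributes works, and `DocumentElement` comes from the new loader below:

```python
# Sketch: the heading starts a section, the table stays intact as its own
# chunk, and both chunks get "## Results" prepended as heading context.
from src.ingestion.chunker import chunk_by_structure
from src.ingestion.docling_loader import DocumentElement

elements = [
    DocumentElement(element_type="heading", text="Results", level=2),
    DocumentElement(element_type="paragraph", text="Accuracy improved by 4 points."),
    DocumentElement(element_type="table", text="| model | acc |\n|---|---|\n| base | 0.81 |"),
]

for chunk in chunk_by_structure(elements, max_tokens=300):
    print(chunk["element_type"], repr(chunk["section_heading"]), chunk["chars"])
# -> section 'Results' ...
# -> table 'Results' ...
```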
src/ingestion/docling_loader.py
ADDED
@@ -0,0 +1,364 @@

```python
"""
Docling-based document loader for multi-format document processing.

Supports: PDF, DOCX, PPTX, HTML, images, and Markdown.
Provides structure-aware parsing with table extraction and hierarchy preservation.
"""

import os
import glob
from pathlib import Path
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, field
import logging

logger = logging.getLogger(__name__)

# Supported file extensions
SUPPORTED_EXTENSIONS = {
    ".pdf", ".docx", ".pptx", ".xlsx",
    ".html", ".htm",
    ".md", ".markdown",
    ".png", ".jpg", ".jpeg", ".tiff", ".bmp"
}


@dataclass
class DocumentElement:
    """Represents a structural element in a document."""
    element_type: str  # paragraph, table, heading, list, code, image
    text: str
    level: int = 0  # heading level (1-6) or nesting depth
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class ParsedDocument:
    """Result of parsing a document with Docling."""
    filename: str
    path: str
    elements: List[DocumentElement]
    format: str
    page_count: int = 0
    metadata: Dict[str, Any] = field(default_factory=dict)
    status: str = "OK"
    error: Optional[str] = None

    @property
    def full_text(self) -> str:
        """Get concatenated text from all elements."""
        return "\n\n".join(el.text for el in self.elements if el.text.strip())

    @property
    def chars(self) -> int:
        return len(self.full_text)

    @property
    def words(self) -> int:
        return len(self.full_text.split())


def _get_docling_converter():
    """Lazy load Docling converter to avoid import overhead."""
    try:
        from docling.document_converter import DocumentConverter
        return DocumentConverter()
    except ImportError as e:
        logger.error(f"Docling not installed: {e}")
        raise ImportError(
            "Docling is required for multi-format document loading. "
            "Install with: pip install docling"
        ) from e


def _extract_elements_from_docling(doc_result) -> List[DocumentElement]:
    """
    Extract structured elements from a Docling conversion result.

    Args:
        doc_result: Docling ConversionResult object

    Returns:
        List of DocumentElement objects
    """
    elements = []

    try:
        # Get the DoclingDocument
        docling_doc = doc_result.document

        # Iterate through document items
        for item, level in docling_doc.iterate_items():
            item_type = item.__class__.__name__.lower()

            # Map Docling item types to our element types
            if "heading" in item_type or "title" in item_type:
                el_type = "heading"
                el_level = getattr(item, "level", 1)
            elif "table" in item_type:
                el_type = "table"
                el_level = 0
            elif "list" in item_type:
                el_type = "list"
                el_level = level
            elif "code" in item_type:
                el_type = "code"
                el_level = 0
            elif "image" in item_type or "figure" in item_type:
                el_type = "image"
                el_level = 0
            else:
                el_type = "paragraph"
                el_level = level

            # Extract text content
            text = ""
            if hasattr(item, "text") and item.text:
                text = item.text
            elif hasattr(item, "export_to_markdown"):
                try:
                    # Some items require doc parameter
                    text = item.export_to_markdown(docling_doc)
                except TypeError:
                    try:
                        text = item.export_to_markdown()
                    except Exception:
                        text = str(item) if hasattr(item, "__str__") else ""
            elif hasattr(item, "__str__"):
                text = str(item)

            if text and text.strip():
                elements.append(DocumentElement(
                    element_type=el_type,
                    text=text.strip(),
                    level=el_level,
                    metadata={
                        "original_type": item_type,
                        "depth": level
                    }
                ))

    except Exception as e:
        logger.warning(f"Error extracting elements: {e}")
        # Fallback: try to get markdown export
        try:
            md_text = doc_result.document.export_to_markdown()
            if md_text:
                elements.append(DocumentElement(
                    element_type="paragraph",
                    text=md_text,
                    level=0
                ))
        except Exception:
            pass

    return elements


def load_document_with_docling(file_path: str) -> ParsedDocument:
    """
    Load a single document using Docling.

    Args:
        file_path: Path to the document file

    Returns:
        ParsedDocument with extracted structure and content
    """
    path = Path(file_path)

    if not path.exists():
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=path.suffix.lower(),
            status="ERROR",
            error=f"File not found: {file_path}"
        )

    ext = path.suffix.lower()
    if ext not in SUPPORTED_EXTENSIONS:
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=ext,
            status="SKIPPED",
            error=f"Unsupported format: {ext}"
        )

    try:
        converter = _get_docling_converter()
        result = converter.convert(str(path))

        elements = _extract_elements_from_docling(result)

        # Get page count if available
        page_count = 0
        try:
            if hasattr(result.document, "pages"):
                page_count = len(result.document.pages)
        except Exception:
            pass

        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=elements,
            format=ext,
            page_count=page_count,
            metadata={
                "converter": "docling",
                "element_count": len(elements)
            },
            status="OK"
        )

    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return ParsedDocument(
            filename=path.name,
            path=str(path),
            elements=[],
            format=ext,
            status="ERROR",
            error=str(e)
        )


def load_documents_with_docling(
    dir_path: str,
    extensions: Optional[List[str]] = None,
    max_chars: int = 50000,
    recursive: bool = False
) -> List[ParsedDocument]:
    """
    Load multiple documents from a directory using Docling.

    Args:
        dir_path: Path to directory containing documents
        extensions: List of extensions to process (default: all supported)
        max_chars: Maximum characters per document (skip larger files)
        recursive: Whether to search subdirectories

    Returns:
        List of ParsedDocument objects
    """
    path = Path(dir_path).expanduser()

    if not path.is_dir():
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    if extensions is None:
        extensions = list(SUPPORTED_EXTENSIONS)
    else:
        extensions = [e if e.startswith(".") else f".{e}" for e in extensions]

    # Find all matching files
    files = []
    for ext in extensions:
        pattern = f"**/*{ext}" if recursive else f"*{ext}"
        files.extend(path.glob(pattern))

    files = sorted(set(files))

    documents = []
    for file_path in files:
        doc = load_document_with_docling(str(file_path))

        # Check size limit
        if doc.status == "OK" and doc.chars > max_chars:
            doc.status = "SKIPPED_TOO_LARGE"
            doc.error = f"Document exceeds {max_chars} chars ({doc.chars})"
            doc.elements = []

        documents.append(doc)

    return documents


def convert_to_legacy_format(docs: List[ParsedDocument]) -> List[Dict]:
    """
    Convert ParsedDocument list to legacy format for backward compatibility.

    Args:
        docs: List of ParsedDocument objects

    Returns:
        List of dicts matching load_markdown_docs output format
    """
    legacy = []
    for doc in docs:
        legacy.append({
            "filename": doc.filename,
            "path": doc.path,
            "text": doc.full_text if doc.status == "OK" else None,
            "chars": doc.chars,
            "words": doc.words,
            "status": doc.status,
            "format": doc.format,
            "elements": doc.elements,  # Additional: structured elements
            "page_count": doc.page_count,
            "metadata": doc.metadata
        })
    return legacy


def print_summary(docs: List[ParsedDocument]):
    """Print summary of loaded documents."""
    if not docs:
        print("No documents found or all were skipped.")
        return

    print(f"{'FILENAME':40} {'FORMAT':8} {'STATUS':20} {'CHARS':>8} {'ELEMENTS':>8}")
    print("-" * 90)

    for d in docs:
        name = d.filename[:40]
        fmt = d.format[:8]
        status = d.status[:20]
        chars = d.chars
        elements = len(d.elements)
        print(f"{name:40} {fmt:8} {status:20} {chars:8d} {elements:8d}")

    ok_count = sum(1 for d in docs if d.status == "OK")
    skipped = len(docs) - ok_count
    print("-" * 90)
    print(f"Total: {len(docs)} OK: {ok_count} Skipped/Errors: {skipped}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Load documents using Docling for RAG ingestion."
    )
    parser.add_argument("dir", help="Directory containing documents")
    parser.add_argument(
        "--extensions", "-e",
        nargs="+",
        default=None,
        help="File extensions to process (default: all supported)"
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=50000,
        help="Max characters to accept (default: 50000)"
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Search subdirectories recursively"
    )

    args = parser.parse_args()

    docs = load_documents_with_docling(
        args.dir,
        extensions=args.extensions,
        max_chars=args.max_chars,
        recursive=args.recursive
    )
    print_summary(docs)
```
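
The new module is also usable directly; a sketch with a hypothetical `sample_docs` directory (the module doubles as a CLI via its `__main__` block, e.g. `python src/ingestion/docling_loader.py sample_docs -r`):

```python
# Sketch: parse a mixed folder, then count tables per successfully parsed file.
from src.ingestion.docling_loader import load_documents_with_docling, print_summary

docs = load_documents_with_docling("sample_docs", extensions=[".pdf", ".docx"], recursive=True)
print_summary(docs)

for doc in docs:
    if doc.status == "OK":
        tables = [el for el in doc.elements if el.element_type == "table"]
        print(f"{doc.filename}: {doc.page_count} pages, {len(tables)} tables")
```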
src/ingestion/load_docs.py
CHANGED
```diff
@@ -1,13 +1,13 @@
 # RAG-document-assistant/ingestion/load_docs.py
 """
-  - load_markdown_docs(
+Document loader for RAG ingestion.
+
+Provides:
+  - load_markdown_docs(): Legacy markdown-only loader
+  - load_documents(): Unified loader (uses Docling if available, falls back to markdown)
 
 CLI:
-    > python3 load_docs.py /full/path/to/your/
+    > python3 load_docs.py /full/path/to/your/docs/folder
 prints a summary table for each file and exits with code 0.
 """
@@ -15,7 +15,10 @@ import os
 import glob
 import argparse
 import re
-from typing import List, Dict
+import logging
+from typing import List, Dict, Optional
+
+logger = logging.getLogger(__name__)
 
 def _clean_markdown(text: str) -> str:
     """
@@ -128,12 +131,97 @@ def print_summary(docs: List[Dict]):
     print("-" * 80)
     print(f"Total files: {len(docs)} OK: {ok_count} Skipped: {skipped}")
 
+# Try to import Docling loader
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        print_summary as docling_print_summary,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
+
+def load_documents(
+    dir_path: str,
+    extensions: Optional[List[str]] = None,
+    max_chars: int = 50000,
+    use_docling: bool = True,
+    recursive: bool = False
+) -> List[Dict]:
+    """
+    Unified document loader - uses Docling if available, falls back to markdown.
+
+    Args:
+        dir_path: Path to directory containing documents
+        extensions: File extensions to process (None = all supported)
+        max_chars: Maximum characters per document
+        use_docling: Prefer Docling if available
+        recursive: Search subdirectories
+
+    Returns:
+        List of document dicts with text and metadata
+    """
+    if use_docling and DOCLING_AVAILABLE:
+        logger.info("Using Docling multi-format loader")
+        parsed = load_documents_with_docling(
+            dir_path,
+            extensions=extensions,
+            max_chars=max_chars,
+            recursive=recursive
+        )
+        return convert_to_legacy_format(parsed)
+    else:
+        logger.info("Using legacy markdown loader")
+        ext = ".md"
+        if extensions and len(extensions) > 0:
+            ext = extensions[0] if extensions[0].startswith(".") else f".{extensions[0]}"
+        return load_markdown_docs(dir_path, ext=ext, max_chars=max_chars)
+
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-    parser.add_argument("
+    parser = argparse.ArgumentParser(
+        description="Load and summarize documents for RAG ingestion."
+    )
+    parser.add_argument("dir", help="Directory containing documents")
+    parser.add_argument(
+        "--ext", "-e",
+        nargs="+",
+        default=None,
+        help="File extensions to load (default: all supported)"
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=50000,
+        help="Max characters to accept (default 50k)"
+    )
+    parser.add_argument(
+        "--no-docling",
+        action="store_true",
+        help="Disable Docling, use markdown-only loader"
+    )
+    parser.add_argument(
+        "--recursive", "-r",
+        action="store_true",
+        help="Search subdirectories"
+    )
     args = parser.parse_args()
 
+    if args.no_docling or not DOCLING_AVAILABLE:
+        # Legacy markdown mode
+        ext = args.ext[0] if args.ext else ".md"
+        docs = load_markdown_docs(args.dir, ext=ext, max_chars=args.max_chars)
+        print_summary(docs)
+    else:
+        # Docling multi-format mode
+        parsed = load_documents_with_docling(
+            args.dir,
+            extensions=args.ext,
+            max_chars=args.max_chars,
+            recursive=args.recursive
+        )
+        docling_print_summary(parsed)
```
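
Finally, a sketch of the unified `load_documents()` entry point added above, which hides the Docling-vs-markdown decision from callers; `data/docs` is a hypothetical path:

```python
# Sketch: callers get the same legacy dict shape whichever loader ran.
from src.ingestion.load_docs import load_documents

docs = load_documents("data/docs", use_docling=True, recursive=False)
ok = [d for d in docs if d["status"] == "OK"]
print(f"loaded {len(ok)}/{len(docs)} documents")
```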