vn6295337 (Claude Opus 4.5) committed
Commit 7e07738 · 1 Parent(s): f866820

Add Docling integration for multi-format document processing


- Add docling_loader.py for PDF, DOCX, PPTX, HTML, image support
- Add structure-aware chunking (preserves tables, sections)
- Update ingestion API with use_docling and use_structure options
- Update Dockerfile with system deps for Docling
- Remove render.yaml (using HuggingFace Spaces only)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
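
For review context, a minimal usage sketch of the updated ingestion entry point. It assumes the repo root is on `PYTHONPATH` and that a local `docs/` directory exists; neither assumption is part of this commit.

```python
# Minimal sketch (assumes repo root on PYTHONPATH and a local docs/ folder).
from src.ingestion.api import ingest_from_directory, get_supported_formats

# Reports whether Docling imported and which extensions are accepted.
print(get_supported_formats())

# Both new flags degrade gracefully: without Docling the call falls back
# to the legacy markdown loader and plain character-based chunking.
result = ingest_from_directory(
    docs_dir="docs",
    output_path="data/chunks.jsonl",
    use_docling=True,    # parse PDF/DOCX/PPTX/HTML/images via Docling
    use_structure=True,  # structure-aware chunking (tables, sections)
    recursive=True,      # walk subdirectories
)
print(result.status)
```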

Dockerfile CHANGED
@@ -2,14 +2,20 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-# Install system dependencies
+# Install system dependencies for Docling (PDF, OCR, image processing)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1 \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy requirements and install dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
     pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
render.yaml DELETED
@@ -1,17 +0,0 @@
-services:
-  - type: web
-    name: rag-api
-    runtime: python
-    buildCommand: pip install -r requirements.txt
-    startCommand: uvicorn src.api.main:app --host 0.0.0.0 --port $PORT
-    envVars:
-      - key: PYTHON_VERSION
-        value: "3.11"
-      - key: PINECONE_API_KEY
-        sync: false
-      - key: PINECONE_INDEX_NAME
-        sync: false
-      - key: GEMINI_API_KEY
-        sync: false
-      - key: GROQ_API_KEY
-        sync: false
requirements.txt CHANGED
@@ -10,3 +10,6 @@ requests>=2.31.0
 python-dotenv>=1.0.0
 rank-bm25>=0.2.2
 PyPDF2>=3.0.0
+
+# Document processing
+docling>=2.15.0
src/ingestion/api.py CHANGED
@@ -3,18 +3,37 @@ Ingestion API for UI integration.
 
 Provides functions to ingest documents from a directory
 and optionally sync to Pinecone.
+
+Supports both legacy markdown-only loading and multi-format
+loading via Docling.
 """
 
 import json
 import os
+import logging
 from pathlib import Path
 from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
 
 from src.ingestion.load_docs import load_markdown_docs
-from src.ingestion.chunker import chunk_documents
+from src.ingestion.chunker import chunk_documents, chunk_documents_with_structure
 from src.ingestion.embeddings import batch_embed_chunks
 
+logger = logging.getLogger(__name__)
+
+# Try to import Docling loader (optional dependency)
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    logger.info("Docling not available, using markdown-only loader")
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
 
 @dataclass
 class IngestionResult:
@@ -38,7 +57,11 @@ def ingest_from_directory(
     docs_dir: str,
     output_path: str = "data/chunks.jsonl",
     provider: str = "sentence-transformers",
-    dim: int = 384
+    dim: int = 384,
+    use_docling: bool = True,
+    extensions: Optional[List[str]] = None,
+    use_structure: bool = True,
+    recursive: bool = False
 ) -> IngestionResult:
     """
     Ingest documents from a directory and save to chunks.jsonl.
@@ -48,6 +71,10 @@ def ingest_from_directory(
        output_path: Path to save chunks.jsonl
        provider: Embedding provider ("sentence-transformers" or "local")
        dim: Embedding dimension
+       use_docling: Use Docling for multi-format parsing (if available)
+       extensions: File extensions to process (None = all supported)
+       use_structure: Use structure-aware chunking (requires Docling)
+       recursive: Search subdirectories recursively
 
     Returns:
        IngestionResult with status and counts
@@ -65,8 +92,20 @@ def ingest_from_directory(
        )
 
    try:
-       # Load documents
-       docs = load_markdown_docs(docs_dir)
+       # Choose loader based on availability and preference
+       if use_docling and DOCLING_AVAILABLE:
+           logger.info("Using Docling for multi-format document loading")
+           parsed_docs = load_documents_with_docling(
+               docs_dir,
+               extensions=extensions,
+               recursive=recursive
+           )
+           docs = convert_to_legacy_format(parsed_docs)
+       else:
+           logger.info("Using legacy markdown loader")
+           docs = load_markdown_docs(docs_dir)
+           use_structure = False  # No structure without Docling
+
        if not docs:
            return IngestionResult(
                status="warning",
@@ -77,10 +116,19 @@ def ingest_from_directory(
            )
 
        # Count successful loads
-       doc_count = len([d for d in docs if d.get("status") == "ok"])
+       doc_count = len([d for d in docs if d.get("status") == "OK"])
 
-       # Chunk documents
-       chunks = chunk_documents(docs, max_tokens=300, overlap=50)
+       # Chunk documents (structure-aware or legacy)
+       if use_structure and DOCLING_AVAILABLE:
+           chunks = chunk_documents_with_structure(
+               docs,
+               max_tokens=300,
+               overlap=50,
+               use_structure=True
+           )
+       else:
+           chunks = chunk_documents(docs, max_tokens=300, overlap=50)
+
        if not chunks:
            return IngestionResult(
                status="warning",
@@ -93,12 +141,15 @@ def ingest_from_directory(
        # Generate embeddings
        embedded = batch_embed_chunks(chunks, provider=provider, dim=dim)
 
-       # Merge text back into embedded chunks
-       chunk_map = {(c["filename"], c["chunk_id"]): c["text"] for c in chunks}
+       # Merge text and metadata back into embedded chunks
+       chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
        for e in embedded:
            key = (e["filename"], e["chunk_id"])
            if key in chunk_map:
-               e["text"] = chunk_map[key]
+               src = chunk_map[key]
+               e["text"] = src.get("text", "")
+               e["element_type"] = src.get("element_type", "text")
+               e["section_heading"] = src.get("section_heading", "")
 
        # Save to file
        save_path = Path(output_path)
@@ -112,6 +163,8 @@ def ingest_from_directory(
                    "chunk_id": e["chunk_id"],
                    "text": e.get("text", ""),
                    "chars": e.get("chars", 0),
+                   "element_type": e.get("element_type", "text"),
+                   "section_heading": e.get("section_heading", ""),
                    "embedding": e["embedding"]
                }
                fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
@@ -125,6 +178,7 @@ def ingest_from_directory(
        )
 
    except Exception as e:
+       logger.exception("Ingestion failed")
        return IngestionResult(
            status="error",
            documents=0,
@@ -239,6 +293,20 @@ def sync_to_pinecone(
    )
 
 
+def get_supported_formats() -> Dict[str, Any]:
+    """
+    Get information about supported document formats.
+
+    Returns:
+        Dict with docling availability and supported extensions
+    """
+    return {
+        "docling_available": DOCLING_AVAILABLE,
+        "supported_extensions": list(SUPPORTED_EXTENSIONS),
+        "loader": "docling" if DOCLING_AVAILABLE else "markdown-only"
+    }
+
+
 def get_index_status(chunks_path: str = "data/chunks.jsonl") -> Dict[str, Any]:
     """
     Get status of the current index.
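
Reviewer note: with the two new metadata fields, each JSONL record written by `ingest_from_directory` carries `element_type` and `section_heading` alongside the embedding. A short sketch of reading one record back (field names taken from the diff above; the path is the function's default):

```python
import json

# Inspect the first persisted chunk from the default output path.
with open("data/chunks.jsonl", encoding="utf-8") as fh:
    record = json.loads(fh.readline())

print(record["chunk_id"])
print(record["element_type"])     # "section", "table", "text", or "<type>_split"
print(record["section_heading"])  # parent heading, "" when none
print(len(record["embedding"]))   # embedding dimension (default dim=384)
```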
src/ingestion/chunker.py CHANGED
@@ -1,11 +1,15 @@
 # RAG-document-assistant/ingestion/chunker.py
 """
 Text chunking utility for RAG ingestion.
-Inputs: list of docs from load_docs.py
+Inputs: list of docs from load_docs.py or docling_loader.py
 Output: list of chunks with metadata
+
+Supports:
+- Simple character-based chunking (legacy)
+- Structure-aware chunking using Docling elements
 """
 
-from typing import List, Dict
+from typing import List, Dict, Optional, Any
 
 def chunk_text(
     text: str,
@@ -98,6 +102,199 @@ def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
     return all_chunks
 
 
+def chunk_by_structure(
+    elements: List[Any],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    include_heading_context: bool = True
+) -> List[Dict]:
+    """
+    Structure-aware chunking using Docling document elements.
+
+    Groups content by semantic boundaries (headings, tables) rather than
+    arbitrary character counts. Falls back to character-based splitting
+    for oversized elements.
+
+    Args:
+        elements: List of DocumentElement objects from docling_loader
+        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
+        overlap: Token overlap for split elements
+        keep_tables_intact: Keep tables as single chunks even if large
+        include_heading_context: Prepend parent heading to chunks
+
+    Returns:
+        List of chunk dicts with element_type and section metadata
+    """
+    if not elements:
+        return []
+
+    max_chars = max_tokens * 4
+    chunks = []
+    current_heading = ""
+    current_section = []
+    current_chars = 0
+
+    def flush_section():
+        """Flush accumulated section content as a chunk."""
+        nonlocal current_section, current_chars
+        if not current_section:
+            return
+
+        combined_text = "\n\n".join(el.text for el in current_section)
+        if combined_text.strip():
+            # Prepend heading context if available
+            if include_heading_context and current_heading:
+                combined_text = f"## {current_heading}\n\n{combined_text}"
+
+            chunks.append({
+                "text": combined_text.strip(),
+                "chars": len(combined_text),
+                "element_type": "section",
+                "section_heading": current_heading,
+                "element_count": len(current_section)
+            })
+
+        current_section = []
+        current_chars = 0
+
+    for element in elements:
+        el_type = getattr(element, "element_type", "paragraph")
+        el_text = getattr(element, "text", str(element))
+        el_chars = len(el_text)
+
+        # Handle headings - start new section
+        if el_type == "heading":
+            flush_section()
+            current_heading = el_text
+            continue
+
+        # Handle tables - keep intact if configured
+        if el_type == "table" and keep_tables_intact:
+            flush_section()
+            table_text = el_text
+            if include_heading_context and current_heading:
+                table_text = f"## {current_heading}\n\n{el_text}"
+
+            chunks.append({
+                "text": table_text.strip(),
+                "chars": len(table_text),
+                "element_type": "table",
+                "section_heading": current_heading,
+                "element_count": 1
+            })
+            continue
+
+        # Check if adding this element exceeds limit
+        if current_chars + el_chars > max_chars and current_section:
+            flush_section()
+
+        # Handle oversized single elements
+        if el_chars > max_chars:
+            flush_section()
+            # Split large element using character-based chunking
+            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
+            for i, sub_text in enumerate(sub_chunks):
+                prefix = ""
+                if include_heading_context and current_heading:
+                    prefix = f"## {current_heading}\n\n"
+                chunks.append({
+                    "text": f"{prefix}{sub_text}".strip(),
+                    "chars": len(sub_text) + len(prefix),
+                    "element_type": f"{el_type}_split",
+                    "section_heading": current_heading,
+                    "split_index": i,
+                    "element_count": 1
+                })
+            continue
+
+        # Accumulate element in current section
+        current_section.append(element)
+        current_chars += el_chars
+
+    # Flush remaining content
+    flush_section()
+
+    return chunks
+
+
+def chunk_documents_with_structure(
+    docs: List[Dict],
+    max_tokens: int = 300,
+    overlap: int = 50,
+    keep_tables_intact: bool = True,
+    use_structure: bool = True
+) -> List[Dict]:
+    """
+    Chunk documents using structure-aware or legacy chunking.
+
+    Args:
+        docs: List of document dicts (from docling_loader or load_docs)
+        max_tokens: Maximum tokens per chunk
+        overlap: Token overlap between chunks
+        keep_tables_intact: Keep tables as single chunks
+        use_structure: Use structure-aware chunking if elements available
+
+    Returns:
+        List of chunk dicts with metadata
+    """
+    if not isinstance(docs, list):
+        raise TypeError("docs must be a list")
+
+    all_chunks = []
+
+    for d in docs:
+        if not isinstance(d, dict):
+            raise TypeError("Each document must be a dictionary")
+
+        status = d.get("status", "")
+        if status != "OK":
+            continue
+
+        filename = d.get("filename", "unknown")
+        elements = d.get("elements", [])
+
+        # Use structure-aware chunking if elements available
+        if use_structure and elements:
+            raw_chunks = chunk_by_structure(
+                elements,
+                max_tokens=max_tokens,
+                overlap=overlap,
+                keep_tables_intact=keep_tables_intact
+            )
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch["text"],
+                    "chars": ch["chars"],
+                    "element_type": ch.get("element_type", "section"),
+                    "section_heading": ch.get("section_heading", ""),
+                    "format": d.get("format", ""),
+                    "page_count": d.get("page_count", 0)
+                })
+        else:
+            # Fallback to legacy text-based chunking
+            text = d.get("text", "")
+            if not text:
+                continue
+
+            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
+            for i, ch in enumerate(raw_chunks):
+                all_chunks.append({
+                    "filename": filename,
+                    "chunk_id": i,
+                    "text": ch,
+                    "chars": len(ch),
+                    "element_type": "text",
+                    "section_heading": "",
+                    "format": d.get("format", ".md"),
+                    "page_count": 0
+                })
+
+    return all_chunks
+
+
 if __name__ == "__main__":
     # Minimal test
     sample = "This is a test text " * 200
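
A sketch of the new `chunk_by_structure` on hand-built elements. The `DocumentElement` instances and their texts are made up for illustration; a real run would get them from `docling_loader`.

```python
from src.ingestion.docling_loader import DocumentElement
from src.ingestion.chunker import chunk_by_structure

# Hand-built stand-ins for a Docling parse of a small report.
elements = [
    DocumentElement(element_type="heading", text="Results", level=2),
    DocumentElement(element_type="paragraph", text="Accuracy improved overall."),
    DocumentElement(element_type="table", text="| model | acc |\n| --- | --- |\n| a | 0.9 |"),
]

chunks = chunk_by_structure(elements, max_tokens=300, overlap=50)
# Expect two chunks: a "section" for the paragraph and a "table" kept intact,
# both tagged with section_heading "Results" and the heading prepended.
for ch in chunks:
    print(ch["element_type"], repr(ch["section_heading"]), ch["chars"])
```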
src/ingestion/docling_loader.py ADDED
@@ -0,0 +1,364 @@
+"""
+Docling-based document loader for multi-format document processing.
+
+Supports: PDF, DOCX, PPTX, HTML, images, and Markdown.
+Provides structure-aware parsing with table extraction and hierarchy preservation.
+"""
+
+import os
+import glob
+from pathlib import Path
+from typing import List, Dict, Optional, Any
+from dataclasses import dataclass, field
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Supported file extensions
+SUPPORTED_EXTENSIONS = {
+    ".pdf", ".docx", ".pptx", ".xlsx",
+    ".html", ".htm",
+    ".md", ".markdown",
+    ".png", ".jpg", ".jpeg", ".tiff", ".bmp"
+}
+
+
+@dataclass
+class DocumentElement:
+    """Represents a structural element in a document."""
+    element_type: str  # paragraph, table, heading, list, code, image
+    text: str
+    level: int = 0  # heading level (1-6) or nesting depth
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ParsedDocument:
+    """Result of parsing a document with Docling."""
+    filename: str
+    path: str
+    elements: List[DocumentElement]
+    format: str
+    page_count: int = 0
+    metadata: Dict[str, Any] = field(default_factory=dict)
+    status: str = "OK"
+    error: Optional[str] = None
+
+    @property
+    def full_text(self) -> str:
+        """Get concatenated text from all elements."""
+        return "\n\n".join(el.text for el in self.elements if el.text.strip())
+
+    @property
+    def chars(self) -> int:
+        return len(self.full_text)
+
+    @property
+    def words(self) -> int:
+        return len(self.full_text.split())
+
+
+def _get_docling_converter():
+    """Lazy load Docling converter to avoid import overhead."""
+    try:
+        from docling.document_converter import DocumentConverter
+        return DocumentConverter()
+    except ImportError as e:
+        logger.error(f"Docling not installed: {e}")
+        raise ImportError(
+            "Docling is required for multi-format document loading. "
+            "Install with: pip install docling"
+        ) from e
+
+
+def _extract_elements_from_docling(doc_result) -> List[DocumentElement]:
+    """
+    Extract structured elements from a Docling conversion result.
+
+    Args:
+        doc_result: Docling ConversionResult object
+
+    Returns:
+        List of DocumentElement objects
+    """
+    elements = []
+
+    try:
+        # Get the DoclingDocument
+        docling_doc = doc_result.document
+
+        # Iterate through document items
+        for item, level in docling_doc.iterate_items():
+            item_type = item.__class__.__name__.lower()
+
+            # Map Docling item types to our element types
+            if "heading" in item_type or "title" in item_type:
+                el_type = "heading"
+                el_level = getattr(item, "level", 1)
+            elif "table" in item_type:
+                el_type = "table"
+                el_level = 0
+            elif "list" in item_type:
+                el_type = "list"
+                el_level = level
+            elif "code" in item_type:
+                el_type = "code"
+                el_level = 0
+            elif "image" in item_type or "figure" in item_type:
+                el_type = "image"
+                el_level = 0
+            else:
+                el_type = "paragraph"
+                el_level = level
+
+            # Extract text content
+            text = ""
+            if hasattr(item, "text") and item.text:
+                text = item.text
+            elif hasattr(item, "export_to_markdown"):
+                try:
+                    # Some items require doc parameter
+                    text = item.export_to_markdown(docling_doc)
+                except TypeError:
+                    try:
+                        text = item.export_to_markdown()
+                    except Exception:
+                        text = str(item) if hasattr(item, "__str__") else ""
+            elif hasattr(item, "__str__"):
+                text = str(item)
+
+            if text and text.strip():
+                elements.append(DocumentElement(
+                    element_type=el_type,
+                    text=text.strip(),
+                    level=el_level,
+                    metadata={
+                        "original_type": item_type,
+                        "depth": level
+                    }
+                ))
+
+    except Exception as e:
+        logger.warning(f"Error extracting elements: {e}")
+        # Fallback: try to get markdown export
+        try:
+            md_text = doc_result.document.export_to_markdown()
+            if md_text:
+                elements.append(DocumentElement(
+                    element_type="paragraph",
+                    text=md_text,
+                    level=0
+                ))
+        except Exception:
+            pass
+
+    return elements
+
+
+def load_document_with_docling(file_path: str) -> ParsedDocument:
+    """
+    Load a single document using Docling.
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        ParsedDocument with extracted structure and content
+    """
+    path = Path(file_path)
+
+    if not path.exists():
+        return ParsedDocument(
+            filename=path.name,
+            path=str(path),
+            elements=[],
+            format=path.suffix.lower(),
+            status="ERROR",
+            error=f"File not found: {file_path}"
+        )
+
+    ext = path.suffix.lower()
+    if ext not in SUPPORTED_EXTENSIONS:
+        return ParsedDocument(
+            filename=path.name,
+            path=str(path),
+            elements=[],
+            format=ext,
+            status="SKIPPED",
+            error=f"Unsupported format: {ext}"
+        )
+
+    try:
+        converter = _get_docling_converter()
+        result = converter.convert(str(path))
+
+        elements = _extract_elements_from_docling(result)
+
+        # Get page count if available
+        page_count = 0
+        try:
+            if hasattr(result.document, "pages"):
+                page_count = len(result.document.pages)
+        except Exception:
+            pass
+
+        return ParsedDocument(
+            filename=path.name,
+            path=str(path),
+            elements=elements,
+            format=ext,
+            page_count=page_count,
+            metadata={
+                "converter": "docling",
+                "element_count": len(elements)
+            },
+            status="OK"
+        )
+
+    except Exception as e:
+        logger.error(f"Error processing {file_path}: {e}")
+        return ParsedDocument(
+            filename=path.name,
+            path=str(path),
+            elements=[],
+            format=ext,
+            status="ERROR",
+            error=str(e)
+        )
+
+
+def load_documents_with_docling(
+    dir_path: str,
+    extensions: Optional[List[str]] = None,
+    max_chars: int = 50000,
+    recursive: bool = False
+) -> List[ParsedDocument]:
+    """
+    Load multiple documents from a directory using Docling.
+
+    Args:
+        dir_path: Path to directory containing documents
+        extensions: List of extensions to process (default: all supported)
+        max_chars: Maximum characters per document (skip larger files)
+        recursive: Whether to search subdirectories
+
+    Returns:
+        List of ParsedDocument objects
+    """
+    path = Path(dir_path).expanduser()
+
+    if not path.is_dir():
+        raise FileNotFoundError(f"Directory not found: {dir_path}")
+
+    if extensions is None:
+        extensions = list(SUPPORTED_EXTENSIONS)
+    else:
+        extensions = [e if e.startswith(".") else f".{e}" for e in extensions]
+
+    # Find all matching files
+    files = []
+    for ext in extensions:
+        pattern = f"**/*{ext}" if recursive else f"*{ext}"
+        files.extend(path.glob(pattern))
+
+    files = sorted(set(files))
+
+    documents = []
+    for file_path in files:
+        doc = load_document_with_docling(str(file_path))
+
+        # Check size limit
+        if doc.status == "OK" and doc.chars > max_chars:
+            doc.status = "SKIPPED_TOO_LARGE"
+            doc.error = f"Document exceeds {max_chars} chars ({doc.chars})"
+            doc.elements = []
+
+        documents.append(doc)
+
+    return documents
+
+
+def convert_to_legacy_format(docs: List[ParsedDocument]) -> List[Dict]:
+    """
+    Convert ParsedDocument list to legacy format for backward compatibility.
+
+    Args:
+        docs: List of ParsedDocument objects
+
+    Returns:
+        List of dicts matching load_markdown_docs output format
+    """
+    legacy = []
+    for doc in docs:
+        legacy.append({
+            "filename": doc.filename,
+            "path": doc.path,
+            "text": doc.full_text if doc.status == "OK" else None,
+            "chars": doc.chars,
+            "words": doc.words,
+            "status": doc.status,
+            "format": doc.format,
+            "elements": doc.elements,  # Additional: structured elements
+            "page_count": doc.page_count,
+            "metadata": doc.metadata
+        })
+    return legacy
+
+
+def print_summary(docs: List[ParsedDocument]):
+    """Print summary of loaded documents."""
+    if not docs:
+        print("No documents found or all were skipped.")
+        return
+
+    print(f"{'FILENAME':40} {'FORMAT':8} {'STATUS':20} {'CHARS':>8} {'ELEMENTS':>8}")
+    print("-" * 90)
+
+    for d in docs:
+        name = d.filename[:40]
+        fmt = d.format[:8]
+        status = d.status[:20]
+        chars = d.chars
+        elements = len(d.elements)
+        print(f"{name:40} {fmt:8} {status:20} {chars:8d} {elements:8d}")
+
+    ok_count = sum(1 for d in docs if d.status == "OK")
+    skipped = len(docs) - ok_count
+    print("-" * 90)
+    print(f"Total: {len(docs)} OK: {ok_count} Skipped/Errors: {skipped}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Load documents using Docling for RAG ingestion."
+    )
+    parser.add_argument("dir", help="Directory containing documents")
+    parser.add_argument(
+        "--extensions", "-e",
+        nargs="+",
+        default=None,
+        help="File extensions to process (default: all supported)"
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=50000,
+        help="Max characters to accept (default: 50000)"
+    )
+    parser.add_argument(
+        "--recursive", "-r",
+        action="store_true",
+        help="Search subdirectories recursively"
+    )
+
+    args = parser.parse_args()
+
+    docs = load_documents_with_docling(
+        args.dir,
+        extensions=args.extensions,
+        max_chars=args.max_chars,
+        recursive=args.recursive
+    )
+    print_summary(docs)
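
A usage sketch for the new loader module, assuming `docling` is installed and a local `docs/` folder exists:

```python
from src.ingestion.docling_loader import (
    load_documents_with_docling,
    convert_to_legacy_format,
    print_summary,
)

# Parse every supported format under docs/, skipping oversized files.
parsed = load_documents_with_docling("docs", max_chars=50000, recursive=True)
print_summary(parsed)

# Bridge to the legacy dict shape; the extra "elements" key is what
# chunk_documents_with_structure consumes downstream.
legacy = convert_to_legacy_format(parsed)
for d in legacy:
    if d["status"] == "OK":
        print(d["filename"], d["format"], len(d["elements"]), "elements")
```

The same run is available from the CLI defined in the `__main__` block above, e.g. `python3 src/ingestion/docling_loader.py docs -r`.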
src/ingestion/load_docs.py CHANGED
@@ -1,13 +1,13 @@
 # RAG-document-assistant/ingestion/load_docs.py
 """
-Simple markdown document loader for Day-3 ingestion step.
+Document loader for RAG ingestion.
 
-Functions:
-- load_markdown_docs(dir_path, ext='.md', max_chars=20000)
-  -> returns list of dicts: { "filename", "path", "text", "chars", "words" }
+Provides:
+- load_markdown_docs(): Legacy markdown-only loader
+- load_documents(): Unified loader (uses Docling if available, falls back to markdown)
 
 CLI:
-> python3 load_docs.py /full/path/to/your/markdown/folder
+> python3 load_docs.py /full/path/to/your/docs/folder
   prints a summary table for each file and exits with code 0.
 """
 
@@ -15,7 +15,10 @@ import os
 import glob
 import argparse
 import re
-from typing import List, Dict
+import logging
+from typing import List, Dict, Optional
+
+logger = logging.getLogger(__name__)
 
 def _clean_markdown(text: str) -> str:
     """
@@ -128,12 +131,97 @@ def print_summary(docs: List[Dict]):
     print("-" * 80)
     print(f"Total files: {len(docs)} OK: {ok_count} Skipped: {skipped}")
 
+# Try to import Docling loader
+DOCLING_AVAILABLE = False
+try:
+    from src.ingestion.docling_loader import (
+        load_documents_with_docling,
+        convert_to_legacy_format,
+        print_summary as docling_print_summary,
+        SUPPORTED_EXTENSIONS
+    )
+    DOCLING_AVAILABLE = True
+except ImportError:
+    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
+
+
+def load_documents(
+    dir_path: str,
+    extensions: Optional[List[str]] = None,
+    max_chars: int = 50000,
+    use_docling: bool = True,
+    recursive: bool = False
+) -> List[Dict]:
+    """
+    Unified document loader - uses Docling if available, falls back to markdown.
+
+    Args:
+        dir_path: Path to directory containing documents
+        extensions: File extensions to process (None = all supported)
+        max_chars: Maximum characters per document
+        use_docling: Prefer Docling if available
+        recursive: Search subdirectories
+
+    Returns:
+        List of document dicts with text and metadata
+    """
+    if use_docling and DOCLING_AVAILABLE:
+        logger.info("Using Docling multi-format loader")
+        parsed = load_documents_with_docling(
+            dir_path,
+            extensions=extensions,
+            max_chars=max_chars,
+            recursive=recursive
+        )
+        return convert_to_legacy_format(parsed)
+    else:
+        logger.info("Using legacy markdown loader")
+        ext = ".md"
+        if extensions and len(extensions) > 0:
+            ext = extensions[0] if extensions[0].startswith(".") else f".{extensions[0]}"
+        return load_markdown_docs(dir_path, ext=ext, max_chars=max_chars)
+
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Load and summarize markdown docs for RAG ingestion.")
-    parser.add_argument("dir", help="Directory containing markdown (.md) files")
-    parser.add_argument("--ext", default=".md", help="File extension to load")
-    parser.add_argument("--max-chars", type=int, default=20000, help="Max cleaned characters to accept (default 20k)")
+    parser = argparse.ArgumentParser(
+        description="Load and summarize documents for RAG ingestion."
+    )
+    parser.add_argument("dir", help="Directory containing documents")
+    parser.add_argument(
+        "--ext", "-e",
+        nargs="+",
+        default=None,
+        help="File extensions to load (default: all supported)"
+    )
+    parser.add_argument(
+        "--max-chars",
+        type=int,
+        default=50000,
+        help="Max characters to accept (default 50k)"
+    )
+    parser.add_argument(
+        "--no-docling",
+        action="store_true",
+        help="Disable Docling, use markdown-only loader"
+    )
+    parser.add_argument(
+        "--recursive", "-r",
+        action="store_true",
+        help="Search subdirectories"
+    )
     args = parser.parse_args()
 
-    docs = load_markdown_docs(args.dir, ext=args.ext, max_chars=args.max_chars)
-    print_summary(docs)
+    if args.no_docling or not DOCLING_AVAILABLE:
+        # Legacy markdown mode
+        ext = args.ext[0] if args.ext else ".md"
+        docs = load_markdown_docs(args.dir, ext=ext, max_chars=args.max_chars)
+        print_summary(docs)
+    else:
+        # Docling multi-format mode
+        parsed = load_documents_with_docling(
+            args.dir,
+            extensions=args.ext,
+            max_chars=args.max_chars,
+            recursive=args.recursive
+        )
+        docling_print_summary(parsed)
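
Finally, a sketch of the unified `load_documents` fallback. Forcing `use_docling=False` exercises the legacy path, which (per the diff above) only honors the first entry in `extensions`:

```python
from src.ingestion.load_docs import load_documents

# Force the legacy markdown path even when Docling is importable.
docs = load_documents("docs", extensions=[".md"], use_docling=False)
for d in docs:
    print(d["filename"], d["status"], d["chars"])

# With use_docling=True (the default) the same call transparently routes
# through the Docling loader whenever the optional import succeeded.
```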