ibadrehman-outcome committed on
Commit
cf7950b
·
1 Parent(s): c28aa68

feat: add Excel (.xlsx/.xlsm) parsing support via Docling

Browse files

Adds excel_pipeline.py with a dedicated Docling DocumentConverter for
InputFormat.XLSX. Both /parse and /parse/url endpoints now accept .xlsx
and .xlsm files. The PDF parsing routine is completely unchanged.

Files changed (2) hide show
  1. app.py +64 -29
  2. excel_pipeline.py +69 -0
app.py CHANGED
@@ -38,6 +38,7 @@ from config import (
38
  logger,
39
  )
40
  from models import HealthResponse, ParseResponse, URLParseRequest
 
41
  from pipeline import (
42
  _convert_document,
43
  _create_images_zip,
@@ -134,7 +135,7 @@ async def parse_document(
134
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
135
  )
136
 
137
- allowed_extensions = {".pdf"}
138
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
139
  if file_ext not in allowed_extensions:
140
  raise HTTPException(
@@ -142,7 +143,12 @@ async def parse_document(
142
  detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
143
  )
144
 
145
- logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
 
 
 
 
 
146
 
147
  temp_dir = tempfile.mkdtemp()
148
  try:
@@ -152,19 +158,28 @@ async def parse_document(
152
  output_dir = Path(temp_dir) / "output"
153
  output_dir.mkdir(exist_ok=True)
154
 
155
- markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
156
- _convert_document,
157
- input_path,
158
- output_dir,
159
- include_images,
160
- request_id,
161
- start_page,
162
- end_page,
163
- )
164
-
165
  images_zip = None
166
- if include_images and image_count > 0:
167
- images_zip, image_count = _create_images_zip(output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  total_duration = time.time() - start_time
170
  logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
@@ -177,7 +192,7 @@ async def parse_document(
177
  image_count=image_count,
178
  pages_processed=pages_processed,
179
  device_used="cpu",
180
- vlm_model="Docling + Gemini",
181
  gemini_page_count=len(gemini_pages),
182
  gemini_pages=gemini_pages,
183
  )
@@ -225,32 +240,52 @@ async def parse_document_from_url(
225
 
226
  url_path = Path(request.url.split("?")[0])
227
  file_ext = url_path.suffix.lower()
228
- if not file_ext or file_ext not in {".pdf"}:
 
229
  content_type = response.headers.get("content-type", "").lower()
230
  ct_map = {
231
  "application/pdf": ".pdf",
 
 
232
  }
233
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
234
 
 
 
 
 
 
 
 
 
235
  input_path = Path(temp_dir) / f"input{file_ext}"
236
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
237
 
238
  output_dir = Path(temp_dir) / "output"
239
  output_dir.mkdir(exist_ok=True)
240
 
241
- markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
242
- _convert_document,
243
- input_path,
244
- output_dir,
245
- request.include_images,
246
- request_id,
247
- request.start_page,
248
- request.end_page,
249
- )
250
-
251
  images_zip = None
252
- if request.include_images and image_count > 0:
253
- images_zip, image_count = _create_images_zip(output_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  total_duration = time.time() - start_time
256
  logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
@@ -263,7 +298,7 @@ async def parse_document_from_url(
263
  image_count=image_count,
264
  pages_processed=pages_processed,
265
  device_used="cpu",
266
- vlm_model="Docling + Gemini",
267
  gemini_page_count=len(gemini_pages),
268
  gemini_pages=gemini_pages,
269
  )
 
38
  logger,
39
  )
40
  from models import HealthResponse, ParseResponse, URLParseRequest
41
+ from excel_pipeline import _convert_excel
42
  from pipeline import (
43
  _convert_document,
44
  _create_images_zip,
 
135
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
136
  )
137
 
138
+ allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
139
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
140
  if file_ext not in allowed_extensions:
141
  raise HTTPException(
 
143
  detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
144
  )
145
 
146
+ is_excel = file_ext in {".xlsx", ".xlsm"}
147
+
148
+ if is_excel:
149
+ logger.info(f"[{request_id}] File type: Excel ({file_ext})")
150
+ else:
151
+ logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
152
 
153
  temp_dir = tempfile.mkdtemp()
154
  try:
 
158
  output_dir = Path(temp_dir) / "output"
159
  output_dir.mkdir(exist_ok=True)
160
 
 
 
 
 
 
 
 
 
 
 
161
  images_zip = None
162
+ image_count = 0
163
+ gemini_pages: list[int] = []
164
+
165
+ if is_excel:
166
+ markdown_content, json_content, pages_processed = await asyncio.to_thread(
167
+ _convert_excel,
168
+ input_path,
169
+ request_id,
170
+ )
171
+ else:
172
+ markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
173
+ _convert_document,
174
+ input_path,
175
+ output_dir,
176
+ include_images,
177
+ request_id,
178
+ start_page,
179
+ end_page,
180
+ )
181
+ if include_images and image_count > 0:
182
+ images_zip, image_count = _create_images_zip(output_dir)
183
 
184
  total_duration = time.time() - start_time
185
  logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
 
192
  image_count=image_count,
193
  pages_processed=pages_processed,
194
  device_used="cpu",
195
+ vlm_model="Docling + Gemini" if not is_excel else "Docling",
196
  gemini_page_count=len(gemini_pages),
197
  gemini_pages=gemini_pages,
198
  )
 
240
 
241
  url_path = Path(request.url.split("?")[0])
242
  file_ext = url_path.suffix.lower()
243
+ allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
244
+ if not file_ext or file_ext not in allowed_extensions:
245
  content_type = response.headers.get("content-type", "").lower()
246
  ct_map = {
247
  "application/pdf": ".pdf",
248
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
249
+ "application/vnd.ms-excel.sheet.macroenabled.12": ".xlsm",
250
  }
251
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
252
 
253
+ if file_ext not in allowed_extensions:
254
+ raise HTTPException(
255
+ status_code=400,
256
+ detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
257
+ )
258
+
259
+ is_excel = file_ext in {".xlsx", ".xlsm"}
260
+
261
  input_path = Path(temp_dir) / f"input{file_ext}"
262
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
263
 
264
  output_dir = Path(temp_dir) / "output"
265
  output_dir.mkdir(exist_ok=True)
266
 
 
 
 
 
 
 
 
 
 
 
267
  images_zip = None
268
+ image_count = 0
269
+ gemini_pages: list[int] = []
270
+
271
+ if is_excel:
272
+ markdown_content, json_content, pages_processed = await asyncio.to_thread(
273
+ _convert_excel,
274
+ input_path,
275
+ request_id,
276
+ )
277
+ else:
278
+ markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
279
+ _convert_document,
280
+ input_path,
281
+ output_dir,
282
+ request.include_images,
283
+ request_id,
284
+ request.start_page,
285
+ request.end_page,
286
+ )
287
+ if request.include_images and image_count > 0:
288
+ images_zip, image_count = _create_images_zip(output_dir)
289
 
290
  total_duration = time.time() - start_time
291
  logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
 
298
  image_count=image_count,
299
  pages_processed=pages_processed,
300
  device_used="cpu",
301
+ vlm_model="Docling + Gemini" if not is_excel else "Docling",
302
  gemini_page_count=len(gemini_pages),
303
  gemini_pages=gemini_pages,
304
  )
excel_pipeline.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Excel document parsing pipeline using Docling.
2
+
3
+ Uses Docling's native InputFormat.XLSX support to convert Excel workbooks
4
+ to markdown. Each worksheet is treated as a page in the DoclingDocument.
5
+
6
+ This is intentionally separate from the PDF pipeline (_get_converter /
7
+ _convert_document in pipeline.py) and does not share state with it.
8
+ """
9
+
10
+ import time
11
+ from pathlib import Path
12
+
13
+ from docling.datamodel.base_models import InputFormat
14
+ from docling.document_converter import DocumentConverter
15
+
16
+ from config import logger
17
+
18
+ _excel_converter: DocumentConverter | None = None
19
+
20
+
21
def _get_excel_converter() -> DocumentConverter:
    """Return the lazily-created, module-level Docling converter for Excel.

    Kept as its own singleton, distinct from the PDF converter, so the two
    pipelines never share any Docling pipeline state.
    """
    global _excel_converter
    if _excel_converter is not None:
        return _excel_converter
    _excel_converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
    logger.info("Docling Excel converter initialised")
    return _excel_converter
32
+
33
+
34
def _convert_excel(
    input_path: Path,
    request_id: str,
) -> tuple[str, None, int]:
    """Convert an Excel workbook (.xlsx / .xlsm) to markdown via Docling.

    Args:
        input_path: Path to the workbook file.
        request_id: Short request ID used for log correlation.

    Returns:
        A 3-tuple of:
        - markdown_content: Docling-generated markdown for all sheets.
        - json_content: None (reserved, consistent with PDF pipeline).
        - sheets_processed: Number of pages (worksheets) Docling processed.

    Raises:
        Exception: Re-raises any Docling conversion error for the caller
            to handle and surface as a 500 response.
    """
    started_at = time.time()
    logger.info(f"[{request_id}] Starting Excel conversion: {input_path.name}")

    # Docling raises on conversion failure; the caller turns that into a 500.
    conversion = _get_excel_converter().convert(str(input_path))
    document = conversion.document

    markdown_text = document.export_to_markdown()
    # Each worksheet is represented as one page in the DoclingDocument.
    sheet_count = len(document.pages)

    duration = time.time() - started_at
    logger.info(
        f"[{request_id}] Excel conversion complete: "
        f"{sheet_count} sheet(s) in {duration:.2f}s"
    )

    return markdown_text, None, sheet_count