Spaces:
Running on T4
Running on T4
Commit Β·
33af535
1
Parent(s): 87afc64
perf: concurrency improvements for high-volume Excel processing
Browse files- Custom ThreadPoolExecutor (THREAD_POOL_SIZE=32) replaces the default
asyncio executor (~8 threads on T4), set on the event loop at startup
- asyncio.Semaphore (EXCEL_CONCURRENCY=20) caps concurrent Excel jobs per
worker to prevent OOM when many large workbooks arrive simultaneously
- start.sh uses UVICORN_WORKERS (default 2) for multi-process parallelism,
doubling effective throughput on a 4-vCPU T4 without touching model RAM limits
- Both thresholds are tunable via environment variables
- Zero changes to PDF or Excel parsing routines
- app.py +33 -11
- config.py +8 -0
- excel_pipeline.py +140 -41
- start.sh +10 -2
app.py
CHANGED
|
@@ -9,6 +9,7 @@ document parsing:
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import asyncio
|
|
|
|
| 12 |
import re
|
| 13 |
import shutil
|
| 14 |
import tempfile
|
|
@@ -26,6 +27,7 @@ from config import (
|
|
| 26 |
BITMAP_AREA_THRESHOLD,
|
| 27 |
DOCLING_DEVICE,
|
| 28 |
DOCLING_NUM_THREADS,
|
|
|
|
| 29 |
GEMINI_API_KEY,
|
| 30 |
GEMINI_CONCURRENCY,
|
| 31 |
GEMINI_MODEL,
|
|
@@ -35,8 +37,14 @@ from config import (
|
|
| 35 |
MAX_FILE_SIZE_MB,
|
| 36 |
RENDER_DPI,
|
| 37 |
SPARSE_TEXT_THRESHOLD,
|
|
|
|
| 38 |
logger,
|
| 39 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
from models import HealthResponse, ParseResponse, URLParseRequest
|
| 41 |
from excel_pipeline import _convert_excel
|
| 42 |
from pipeline import (
|
|
@@ -50,9 +58,20 @@ from pipeline import (
|
|
| 50 |
|
| 51 |
@asynccontextmanager
|
| 52 |
async def lifespan(app: FastAPI):
|
| 53 |
-
"""Startup: initialize Docling converter."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
logger.info("=" * 60)
|
| 55 |
logger.info("Starting Docling VLM Parser API v6.0.0...")
|
|
|
|
|
|
|
| 56 |
logger.info("Initializing Docling converter...")
|
| 57 |
_get_converter()
|
| 58 |
logger.info("Docling converter ready")
|
|
@@ -74,6 +93,7 @@ async def lifespan(app: FastAPI):
|
|
| 74 |
logger.info("=" * 60)
|
| 75 |
yield
|
| 76 |
logger.info("Shutting down Docling VLM Parser API...")
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
app = FastAPI(
|
|
@@ -163,11 +183,12 @@ async def parse_document(
|
|
| 163 |
gemini_pages: list[int] = []
|
| 164 |
|
| 165 |
if is_excel:
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
else:
|
| 172 |
markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
|
| 173 |
_convert_document,
|
|
@@ -269,11 +290,12 @@ async def parse_document_from_url(
|
|
| 269 |
gemini_pages: list[int] = []
|
| 270 |
|
| 271 |
if is_excel:
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
| 277 |
else:
|
| 278 |
markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
|
| 279 |
_convert_document,
|
|
|
|
| 9 |
"""
|
| 10 |
|
| 11 |
import asyncio
|
| 12 |
+
import concurrent.futures
|
| 13 |
import re
|
| 14 |
import shutil
|
| 15 |
import tempfile
|
|
|
|
| 27 |
BITMAP_AREA_THRESHOLD,
|
| 28 |
DOCLING_DEVICE,
|
| 29 |
DOCLING_NUM_THREADS,
|
| 30 |
+
EXCEL_CONCURRENCY,
|
| 31 |
GEMINI_API_KEY,
|
| 32 |
GEMINI_CONCURRENCY,
|
| 33 |
GEMINI_MODEL,
|
|
|
|
| 37 |
MAX_FILE_SIZE_MB,
|
| 38 |
RENDER_DPI,
|
| 39 |
SPARSE_TEXT_THRESHOLD,
|
| 40 |
+
THREAD_POOL_SIZE,
|
| 41 |
logger,
|
| 42 |
)
|
| 43 |
+
|
| 44 |
+
# Semaphore that caps simultaneous Excel conversions across all requests
|
| 45 |
+
# handled by this worker process. Prevents OOM when many large workbooks
|
| 46 |
+
# arrive concurrently (openpyxl loads the full workbook into RAM).
|
| 47 |
+
_excel_semaphore = asyncio.Semaphore(EXCEL_CONCURRENCY)
|
| 48 |
from models import HealthResponse, ParseResponse, URLParseRequest
|
| 49 |
from excel_pipeline import _convert_excel
|
| 50 |
from pipeline import (
|
|
|
|
| 58 |
|
| 59 |
@asynccontextmanager
|
| 60 |
async def lifespan(app: FastAPI):
|
| 61 |
+
"""Startup: configure thread pool, initialize Docling converter."""
|
| 62 |
+
# Replace the default asyncio executor (min(32, cpu+4) β 8 on T4) with a
|
| 63 |
+
# larger pool so burst Excel/PDF requests drain the queue faster instead of
|
| 64 |
+
# stacking up waiting for a free thread.
|
| 65 |
+
executor = concurrent.futures.ThreadPoolExecutor(
|
| 66 |
+
max_workers=THREAD_POOL_SIZE,
|
| 67 |
+
thread_name_prefix="parser",
|
| 68 |
+
)
|
| 69 |
+
asyncio.get_running_loop().set_default_executor(executor)
|
| 70 |
+
|
| 71 |
logger.info("=" * 60)
|
| 72 |
logger.info("Starting Docling VLM Parser API v6.0.0...")
|
| 73 |
+
logger.info(f"Thread pool size: {THREAD_POOL_SIZE}")
|
| 74 |
+
logger.info(f"Excel concurrency cap: {EXCEL_CONCURRENCY}")
|
| 75 |
logger.info("Initializing Docling converter...")
|
| 76 |
_get_converter()
|
| 77 |
logger.info("Docling converter ready")
|
|
|
|
| 93 |
logger.info("=" * 60)
|
| 94 |
yield
|
| 95 |
logger.info("Shutting down Docling VLM Parser API...")
|
| 96 |
+
executor.shutdown(wait=False)
|
| 97 |
|
| 98 |
|
| 99 |
app = FastAPI(
|
|
|
|
| 183 |
gemini_pages: list[int] = []
|
| 184 |
|
| 185 |
if is_excel:
|
| 186 |
+
async with _excel_semaphore:
|
| 187 |
+
markdown_content, json_content, pages_processed = await asyncio.to_thread(
|
| 188 |
+
_convert_excel,
|
| 189 |
+
input_path,
|
| 190 |
+
request_id,
|
| 191 |
+
)
|
| 192 |
else:
|
| 193 |
markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
|
| 194 |
_convert_document,
|
|
|
|
| 290 |
gemini_pages: list[int] = []
|
| 291 |
|
| 292 |
if is_excel:
|
| 293 |
+
async with _excel_semaphore:
|
| 294 |
+
markdown_content, json_content, pages_processed = await asyncio.to_thread(
|
| 295 |
+
_convert_excel,
|
| 296 |
+
input_path,
|
| 297 |
+
request_id,
|
| 298 |
+
)
|
| 299 |
else:
|
| 300 |
markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
|
| 301 |
_convert_document,
|
config.py
CHANGED
|
@@ -27,6 +27,14 @@ GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
|
|
| 27 |
GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
|
| 28 |
GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8"))
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
BLOCKED_HOSTNAMES = {
|
| 31 |
"localhost",
|
| 32 |
"metadata",
|
|
|
|
| 27 |
GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
|
| 28 |
GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8"))
|
| 29 |
|
| 30 |
+
# Concurrency tuning
|
| 31 |
+
# THREAD_POOL_SIZE: replaces the default asyncio executor (min(32, cpu+4) β 8
|
| 32 |
+
# on a 4-vCPU T4). 32 lets the queue drain much faster under burst load.
|
| 33 |
+
# EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM
|
| 34 |
+
# when many large workbooks arrive at once (openpyxl loads full file into RAM).
|
| 35 |
+
THREAD_POOL_SIZE = int(os.getenv("THREAD_POOL_SIZE", "32"))
|
| 36 |
+
EXCEL_CONCURRENCY = int(os.getenv("EXCEL_CONCURRENCY", "20"))
|
| 37 |
+
|
| 38 |
BLOCKED_HOSTNAMES = {
|
| 39 |
"localhost",
|
| 40 |
"metadata",
|
excel_pipeline.py
CHANGED
|
@@ -28,7 +28,7 @@ from config import logger
|
|
| 28 |
|
| 29 |
|
| 30 |
# ---------------------------------------------------------------------------
|
| 31 |
-
#
|
| 32 |
# ---------------------------------------------------------------------------
|
| 33 |
|
| 34 |
def _cell_str(value: Any) -> str:
|
|
@@ -43,6 +43,11 @@ def _cell_str(value: Any) -> str:
|
|
| 43 |
return str(value).strip()
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def _build_value_map(ws: Worksheet) -> dict[tuple[int, int], str]:
|
| 47 |
"""Return a (row, col) β string map with merged regions resolved.
|
| 48 |
|
|
@@ -63,56 +68,151 @@ def _build_value_map(ws: Worksheet) -> dict[tuple[int, int], str]:
|
|
| 63 |
return values
|
| 64 |
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
"""
|
| 78 |
if not ws.max_row or not ws.max_column:
|
| 79 |
-
return "_Empty sheet_"
|
| 80 |
|
| 81 |
values = _build_value_map(ws)
|
| 82 |
-
max_row: int = ws.max_row
|
| 83 |
max_col: int = ws.max_column
|
| 84 |
|
| 85 |
-
|
| 86 |
[values.get((r, c), "") for c in range(1, max_col + 1)]
|
| 87 |
-
for r in range(1, max_row + 1)
|
| 88 |
]
|
| 89 |
|
| 90 |
-
# Drop
|
| 91 |
-
while
|
| 92 |
-
|
| 93 |
|
| 94 |
-
if not
|
| 95 |
-
return "_Empty sheet_"
|
| 96 |
|
| 97 |
-
|
| 98 |
-
data_rows = rows[1:]
|
| 99 |
|
| 100 |
-
#
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
)
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
for row in
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
# ---------------------------------------------------------------------------
|
|
@@ -125,9 +225,8 @@ def _convert_excel(
|
|
| 125 |
) -> tuple[str, None, int]:
|
| 126 |
"""Parse an Excel workbook and return markdown with HTML tables.
|
| 127 |
|
| 128 |
-
Each worksheet becomes a ## heading followed by
|
| 129 |
-
|
| 130 |
-
exactly so downstream tasks see a consistent structure.
|
| 131 |
|
| 132 |
Args:
|
| 133 |
input_path: Path to the .xlsx / .xlsm file.
|
|
@@ -135,7 +234,7 @@ def _convert_excel(
|
|
| 135 |
|
| 136 |
Returns:
|
| 137 |
A 3-tuple of:
|
| 138 |
-
- markdown_content:
|
| 139 |
- json_content: None (reserved, consistent with PDF pipeline).
|
| 140 |
- sheets_processed: Number of worksheets converted.
|
| 141 |
|
|
@@ -153,7 +252,7 @@ def _convert_excel(
|
|
| 153 |
f"[{request_id}] Workbook has {len(sheet_names)} sheet(s): {sheet_names}"
|
| 154 |
)
|
| 155 |
|
| 156 |
-
|
| 157 |
|
| 158 |
for sheet_name in sheet_names:
|
| 159 |
ws = wb[sheet_name]
|
|
@@ -161,12 +260,12 @@ def _convert_excel(
|
|
| 161 |
f"[{request_id}] Processing sheet '{sheet_name}': "
|
| 162 |
f"{ws.max_row or 0} rows Γ {ws.max_column or 0} cols"
|
| 163 |
)
|
| 164 |
-
|
| 165 |
-
|
| 166 |
|
| 167 |
wb.close()
|
| 168 |
|
| 169 |
-
markdown = "\n\n---\n\n".join(
|
| 170 |
sheets_processed = len(sheet_names)
|
| 171 |
|
| 172 |
elapsed = time.time() - t_start
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
# ---------------------------------------------------------------------------
|
| 31 |
+
# Cell helpers
|
| 32 |
# ---------------------------------------------------------------------------
|
| 33 |
|
| 34 |
def _cell_str(value: Any) -> str:
|
|
|
|
| 43 |
return str(value).strip()
|
| 44 |
|
| 45 |
|
| 46 |
+
def _cell_html(value: str) -> str:
|
| 47 |
+
"""Escape HTML special chars and replace newlines with <br>."""
|
| 48 |
+
return escape(value).replace("\n", "<br>").replace("\r", "")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
def _build_value_map(ws: Worksheet) -> dict[tuple[int, int], str]:
|
| 52 |
"""Return a (row, col) β string map with merged regions resolved.
|
| 53 |
|
|
|
|
| 68 |
return values
|
| 69 |
|
| 70 |
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
# Row classification
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
def _is_numeric(value: str) -> bool:
|
| 76 |
+
"""Return True if value is a pure number (int, float, or percentage)."""
|
| 77 |
+
cleaned = value.replace(",", "").replace(" ", "").rstrip("%")
|
| 78 |
+
try:
|
| 79 |
+
float(cleaned)
|
| 80 |
+
return True
|
| 81 |
+
except ValueError:
|
| 82 |
+
return False
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _is_header_like(row: list[str]) -> bool:
|
| 86 |
+
"""Return True if the row looks like a column header row.
|
| 87 |
+
|
| 88 |
+
Conditions:
|
| 89 |
+
- At least half of the cells are non-empty.
|
| 90 |
+
- None of the non-empty cells contain a purely numeric value.
|
| 91 |
+
"""
|
| 92 |
+
non_empty = [v for v in row if v]
|
| 93 |
+
if len(non_empty) < max(1, len(row) // 2):
|
| 94 |
+
return False
|
| 95 |
+
return all(not _is_numeric(v) for v in non_empty)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _is_label_row(row: list[str], max_col: int) -> bool:
|
| 99 |
+
"""Return True if only the first cell has content and the rest are empty.
|
| 100 |
+
|
| 101 |
+
Only meaningful when the sheet has more than one column β otherwise
|
| 102 |
+
every single-value row would be misclassified as a heading.
|
| 103 |
+
"""
|
| 104 |
+
return max_col > 1 and bool(row[0]) and all(v == "" for v in row[1:])
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
# HTML table builder
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
|
| 111 |
+
def _build_html_table(header: list[str] | None, rows: list[list[str]]) -> str:
|
| 112 |
+
"""Render a header + body into an HTML table string.
|
| 113 |
+
|
| 114 |
+
Header cells use <th>; blank header cells fall back to 'ColN'.
|
| 115 |
+
Body rows are padded to the header width.
|
| 116 |
+
Returns an empty string when both header and rows are absent.
|
| 117 |
+
"""
|
| 118 |
+
if not header and not rows:
|
| 119 |
+
return ""
|
| 120 |
+
|
| 121 |
+
col_count = len(header) if header else max((len(r) for r in rows), default=0)
|
| 122 |
+
lines = ["<table>"]
|
| 123 |
+
|
| 124 |
+
if header:
|
| 125 |
+
th_cells = "".join(
|
| 126 |
+
f"<th>{_cell_html(h) if h else f'Col{i + 1}'}</th>"
|
| 127 |
+
for i, h in enumerate(header)
|
| 128 |
+
)
|
| 129 |
+
lines += ["<thead>", f"<tr>{th_cells}</tr>", "</thead>"]
|
| 130 |
+
|
| 131 |
+
if rows:
|
| 132 |
+
lines.append("<tbody>")
|
| 133 |
+
for row in rows:
|
| 134 |
+
padded = row + [""] * max(0, col_count - len(row))
|
| 135 |
+
td_cells = "".join(f"<td>{_cell_html(v)}</td>" for v in padded)
|
| 136 |
+
lines.append(f"<tr>{td_cells}</tr>")
|
| 137 |
+
lines.append("</tbody>")
|
| 138 |
+
|
| 139 |
+
lines.append("</table>")
|
| 140 |
+
return "\n".join(lines)
|
| 141 |
|
| 142 |
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
# Sheet β sections
|
| 145 |
+
# ---------------------------------------------------------------------------
|
| 146 |
+
|
| 147 |
+
def _sheet_to_sections(ws: Worksheet) -> list[str]:
|
| 148 |
+
"""Convert a worksheet into an ordered list of markdown sections.
|
| 149 |
|
| 150 |
+
Each section is either:
|
| 151 |
+
- A '## heading' string (label row: text in col 0, rest empty)
|
| 152 |
+
- An HTML table string (one <table> per contiguous non-empty block)
|
| 153 |
+
|
| 154 |
+
Rules applied in order for each row:
|
| 155 |
+
1. Fully empty row β flush the current table, start a new one.
|
| 156 |
+
2. Label row β flush, emit ## heading.
|
| 157 |
+
3. First header-like row in the current block β becomes <thead>.
|
| 158 |
+
4. Everything else β <tbody> row.
|
| 159 |
"""
|
| 160 |
if not ws.max_row or not ws.max_column:
|
| 161 |
+
return ["_Empty sheet_"]
|
| 162 |
|
| 163 |
values = _build_value_map(ws)
|
|
|
|
| 164 |
max_col: int = ws.max_column
|
| 165 |
|
| 166 |
+
all_rows: list[list[str]] = [
|
| 167 |
[values.get((r, c), "") for c in range(1, max_col + 1)]
|
| 168 |
+
for r in range(1, ws.max_row + 1)
|
| 169 |
]
|
| 170 |
|
| 171 |
+
# Drop trailing empty rows
|
| 172 |
+
while all_rows and all(v == "" for v in all_rows[-1]):
|
| 173 |
+
all_rows.pop()
|
| 174 |
|
| 175 |
+
if not all_rows:
|
| 176 |
+
return ["_Empty sheet_"]
|
| 177 |
|
| 178 |
+
sections: list[str] = []
|
|
|
|
| 179 |
|
| 180 |
+
# Mutable table accumulator
|
| 181 |
+
table_header: list[str] | None = None
|
| 182 |
+
table_rows: list[list[str]] = []
|
| 183 |
+
header_found = False
|
|
|
|
| 184 |
|
| 185 |
+
def flush() -> None:
|
| 186 |
+
nonlocal table_header, table_rows, header_found
|
| 187 |
+
html = _build_html_table(table_header, table_rows)
|
| 188 |
+
if html:
|
| 189 |
+
sections.append(html)
|
| 190 |
+
table_header = None
|
| 191 |
+
table_rows = []
|
| 192 |
+
header_found = False
|
| 193 |
|
| 194 |
+
for row in all_rows:
|
| 195 |
+
if all(v == "" for v in row):
|
| 196 |
+
# Empty row β end current table block
|
| 197 |
+
flush()
|
| 198 |
+
continue
|
| 199 |
|
| 200 |
+
if _is_label_row(row, max_col):
|
| 201 |
+
# Single-label row β heading
|
| 202 |
+
flush()
|
| 203 |
+
sections.append(f"## {escape(row[0])}")
|
| 204 |
+
continue
|
| 205 |
+
|
| 206 |
+
if not header_found and _is_header_like(row):
|
| 207 |
+
# First header-like row in this block β <thead>
|
| 208 |
+
table_header = row
|
| 209 |
+
header_found = True
|
| 210 |
+
else:
|
| 211 |
+
table_rows.append(row)
|
| 212 |
+
|
| 213 |
+
flush()
|
| 214 |
+
|
| 215 |
+
return sections if sections else ["_Empty sheet_"]
|
| 216 |
|
| 217 |
|
| 218 |
# ---------------------------------------------------------------------------
|
|
|
|
| 225 |
) -> tuple[str, None, int]:
|
| 226 |
"""Parse an Excel workbook and return markdown with HTML tables.
|
| 227 |
|
| 228 |
+
Each worksheet becomes a ## heading followed by its sections (headings
|
| 229 |
+
and HTML tables). Sheets are separated by horizontal rules.
|
|
|
|
| 230 |
|
| 231 |
Args:
|
| 232 |
input_path: Path to the .xlsx / .xlsm file.
|
|
|
|
| 234 |
|
| 235 |
Returns:
|
| 236 |
A 3-tuple of:
|
| 237 |
+
- markdown_content: Full markdown, HTML tables matching Gemini format.
|
| 238 |
- json_content: None (reserved, consistent with PDF pipeline).
|
| 239 |
- sheets_processed: Number of worksheets converted.
|
| 240 |
|
|
|
|
| 252 |
f"[{request_id}] Workbook has {len(sheet_names)} sheet(s): {sheet_names}"
|
| 253 |
)
|
| 254 |
|
| 255 |
+
sheet_blocks: list[str] = []
|
| 256 |
|
| 257 |
for sheet_name in sheet_names:
|
| 258 |
ws = wb[sheet_name]
|
|
|
|
| 260 |
f"[{request_id}] Processing sheet '{sheet_name}': "
|
| 261 |
f"{ws.max_row or 0} rows Γ {ws.max_column or 0} cols"
|
| 262 |
)
|
| 263 |
+
sections = _sheet_to_sections(ws)
|
| 264 |
+
sheet_blocks.append(f"## {sheet_name}\n\n" + "\n\n".join(sections))
|
| 265 |
|
| 266 |
wb.close()
|
| 267 |
|
| 268 |
+
markdown = "\n\n---\n\n".join(sheet_blocks)
|
| 269 |
sheets_processed = len(sheet_names)
|
| 270 |
|
| 271 |
elapsed = time.time() - t_start
|
start.sh
CHANGED
|
@@ -1,4 +1,12 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
-
# Start the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
exec uvicorn app:app
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
+
# Start the Docling + Gemini hybrid parser API.
|
| 3 |
+
#
|
| 4 |
+
# UVICORN_WORKERS: number of worker processes (default 2).
|
| 5 |
+
# Each worker loads its own copy of the Docling model, so don't set this
|
| 6 |
+
# higher than RAM allows. On T4 Small (15GB RAM), 2 is a safe default.
|
| 7 |
+
# Set UVICORN_WORKERS=1 to revert to single-process mode.
|
| 8 |
|
| 9 |
+
exec uvicorn app:app \
|
| 10 |
+
--host 0.0.0.0 \
|
| 11 |
+
--port 7860 \
|
| 12 |
+
--workers "${UVICORN_WORKERS:-2}"
|