Spaces:
Running on T4
Running on T4
Commit · 0111393
1 Parent(s): e8991b2
fix: remove GPU-dependent pre-download, use restructure_pages for cross-page tables, robust md extraction
Browse files
- Dockerfile +4 -10
- pipeline.py +26 -18
Dockerfile
CHANGED
|
@@ -82,15 +82,9 @@ RUN echo "========== STEP 4: Installing Python dependencies ==========" && \
|
|
| 82 |
pip list --user && \
|
| 83 |
echo "========== Python dependencies installed =========="
|
| 84 |
|
| 85 |
-
#
|
| 86 |
-
#
|
| 87 |
-
RUN echo "========== STEP 5:
|
| 88 |
-
CUDA_VISIBLE_DEVICES="" python -c "from paddleocr import PaddleOCRVL; PaddleOCRVL(use_gpu=False)" && \
|
| 89 |
-
echo "Model cache summary:" && \
|
| 90 |
-
du -sh /home/user/.cache/paddleocr 2>/dev/null || echo " PaddleOCR cache: (empty)" && \
|
| 91 |
-
du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
|
| 92 |
-
du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
|
| 93 |
-
echo "========== PaddleOCR-VL-1.5 model downloaded =========="
|
| 94 |
|
| 95 |
# Copy application code
|
| 96 |
COPY --chown=user:user . .
|
|
@@ -104,7 +98,7 @@ RUN echo "========== STEP 6: Finalizing build ==========" && \
|
|
| 104 |
EXPOSE 7860
|
| 105 |
|
| 106 |
# Health check
|
| 107 |
-
HEALTHCHECK --interval=30s --timeout=30s --start-period=
|
| 108 |
CMD curl -f http://localhost:7860/ || exit 1
|
| 109 |
|
| 110 |
# Single-process FastAPI app (no vLLM sidecar needed)
|
|
|
|
| 82 |
pip list --user && \
|
| 83 |
echo "========== Python dependencies installed =========="
|
| 84 |
|
| 85 |
+
# NOTE: Model pre-download skipped — PaddlePaddle GPU requires CUDA at import time,
|
| 86 |
+
# which is unavailable during Docker build. Model downloads on first startup (~60s).
|
| 87 |
+
RUN echo "========== STEP 5: Skipping model pre-download (no GPU during build) =========="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
# Copy application code
|
| 90 |
COPY --chown=user:user . .
|
|
|
|
| 98 |
EXPOSE 7860
|
| 99 |
|
| 100 |
# Health check
|
| 101 |
+
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=5 \
|
| 102 |
CMD curl -f http://localhost:7860/ || exit 1
|
| 103 |
|
| 104 |
# Single-process FastAPI app (no vLLM sidecar needed)
|
pipeline.py
CHANGED
|
@@ -29,6 +29,15 @@ def _get_pipeline():
|
|
| 29 |
return _pipeline
|
| 30 |
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def _page_has_tables(result) -> bool:
|
| 33 |
"""Check if PaddleOCR result contains table elements from layout analysis.
|
| 34 |
|
|
@@ -43,11 +52,7 @@ def _page_has_tables(result) -> bool:
|
|
| 43 |
if block.get('label', '').lower() == 'table':
|
| 44 |
return True
|
| 45 |
# Fallback: check markdown content for table patterns
|
| 46 |
-
|
| 47 |
-
if isinstance(md, dict):
|
| 48 |
-
md_text = md.get('markdown_texts', '')
|
| 49 |
-
else:
|
| 50 |
-
md_text = str(md)
|
| 51 |
return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
|
| 52 |
except Exception:
|
| 53 |
return False
|
|
@@ -113,19 +118,27 @@ def _convert_document(
|
|
| 113 |
# ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
|
| 114 |
pipeline = _get_pipeline()
|
| 115 |
paddle_start = time.time()
|
| 116 |
-
output = pipeline.predict(str(input_path))
|
| 117 |
paddle_time = time.time() - paddle_start
|
| 118 |
|
| 119 |
-
# Collect per-page
|
| 120 |
-
|
| 121 |
table_pages = set()
|
| 122 |
-
for i, res in enumerate(
|
| 123 |
-
md_data = res.markdown
|
| 124 |
-
page_markdowns.append(md_data)
|
| 125 |
-
# Check if this page has tables from layout analysis
|
| 126 |
if _page_has_tables(res):
|
| 127 |
table_pages.add(i)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
logger.info(
|
| 130 |
f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
|
| 131 |
f"in {paddle_time:.2f}s — {len(table_pages)} table pages detected"
|
|
@@ -184,12 +197,7 @@ def _convert_document(
|
|
| 184 |
if i in gemini_page_texts:
|
| 185 |
md_parts.append(gemini_page_texts[i])
|
| 186 |
else:
|
| 187 |
-
|
| 188 |
-
if isinstance(md_data, dict):
|
| 189 |
-
md_text = md_data.get("markdown_texts", "")
|
| 190 |
-
else:
|
| 191 |
-
md_text = str(md_data)
|
| 192 |
-
md_parts.append(md_text)
|
| 193 |
|
| 194 |
markdown_content = "\n\n".join(md_parts)
|
| 195 |
|
|
|
|
| 29 |
return _pipeline
|
| 30 |
|
| 31 |
|
| 32 |
+
def _extract_markdown_text(md_data) -> str:
|
| 33 |
+
"""Extract markdown text from PaddleOCR result, handling different formats."""
|
| 34 |
+
if isinstance(md_data, dict):
|
| 35 |
+
# Official API returns {"text": "...", "images": {...}}
|
| 36 |
+
# Some versions use "markdown_texts"
|
| 37 |
+
return md_data.get("text", "") or md_data.get("markdown_texts", "")
|
| 38 |
+
return str(md_data) if md_data else ""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
def _page_has_tables(result) -> bool:
|
| 42 |
"""Check if PaddleOCR result contains table elements from layout analysis.
|
| 43 |
|
|
|
|
| 52 |
if block.get('label', '').lower() == 'table':
|
| 53 |
return True
|
| 54 |
# Fallback: check markdown content for table patterns
|
| 55 |
+
md_text = _extract_markdown_text(result.markdown)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
|
| 57 |
except Exception:
|
| 58 |
return False
|
|
|
|
| 118 |
# ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
|
| 119 |
pipeline = _get_pipeline()
|
| 120 |
paddle_start = time.time()
|
| 121 |
+
output = pipeline.predict(input=str(input_path))
|
| 122 |
paddle_time = time.time() - paddle_start
|
| 123 |
|
| 124 |
+
# Collect per-page results and detect table pages
|
| 125 |
+
page_results = list(output)
|
| 126 |
table_pages = set()
|
| 127 |
+
for i, res in enumerate(page_results):
|
|
|
|
|
|
|
|
|
|
| 128 |
if _page_has_tables(res):
|
| 129 |
table_pages.add(i)
|
| 130 |
|
| 131 |
+
# Use restructure_pages for cross-page table merging and title leveling
|
| 132 |
+
try:
|
| 133 |
+
restructured = pipeline.restructure_pages(
|
| 134 |
+
page_results, merge_table=True, relevel_titles=True
|
| 135 |
+
)
|
| 136 |
+
page_markdowns = [res.markdown for res in restructured]
|
| 137 |
+
logger.info(f"[{request_id}] restructure_pages applied (merge_table + relevel_titles)")
|
| 138 |
+
except Exception as e:
|
| 139 |
+
logger.warning(f"[{request_id}] restructure_pages failed ({e}), using raw results")
|
| 140 |
+
page_markdowns = [res.markdown for res in page_results]
|
| 141 |
+
|
| 142 |
logger.info(
|
| 143 |
f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
|
| 144 |
f"in {paddle_time:.2f}s — {len(table_pages)} table pages detected"
|
|
|
|
| 197 |
if i in gemini_page_texts:
|
| 198 |
md_parts.append(gemini_page_texts[i])
|
| 199 |
else:
|
| 200 |
+
md_parts.append(_extract_markdown_text(md_data))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
markdown_content = "\n\n".join(md_parts)
|
| 203 |
|