sidoutcome committed on
Commit
0111393
·
1 Parent(s): e8991b2

fix: remove GPU-dependent pre-download, use restructure_pages for cross-page tables, robust md extraction

Browse files
Files changed (2) hide show
  1. Dockerfile +4 -10
  2. pipeline.py +26 -18
Dockerfile CHANGED
@@ -82,15 +82,9 @@ RUN echo "========== STEP 4: Installing Python dependencies ==========" && \
82
  pip list --user && \
83
  echo "========== Python dependencies installed =========="
84
 
85
- # Pre-download PaddleOCR-VL-1.5 model at build time (avoids download on first request)
86
- # CUDA_VISIBLE_DEVICES="" forces CPU mode since no GPU is available during build
87
- RUN echo "========== STEP 5: Pre-downloading PaddleOCR-VL-1.5 model ==========" && \
88
- CUDA_VISIBLE_DEVICES="" python -c "from paddleocr import PaddleOCRVL; PaddleOCRVL(use_gpu=False)" && \
89
- echo "Model cache summary:" && \
90
- du -sh /home/user/.cache/paddleocr 2>/dev/null || echo " PaddleOCR cache: (empty)" && \
91
- du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
92
- du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
93
- echo "========== PaddleOCR-VL-1.5 model downloaded =========="
94
 
95
  # Copy application code
96
  COPY --chown=user:user . .
@@ -104,7 +98,7 @@ RUN echo "========== STEP 6: Finalizing build ==========" && \
104
  EXPOSE 7860
105
 
106
  # Health check
107
- HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=5 \
108
  CMD curl -f http://localhost:7860/ || exit 1
109
 
110
  # Single-process FastAPI app (no vLLM sidecar needed)
 
82
  pip list --user && \
83
  echo "========== Python dependencies installed =========="
84
 
85
+ # NOTE: Model pre-download skipped — PaddlePaddle GPU requires CUDA at import time,
86
+ # which is unavailable during Docker build. Model downloads on first startup (~60s).
87
+ RUN echo "========== STEP 5: Skipping model pre-download (no GPU during build) =========="
 
 
 
 
 
 
88
 
89
  # Copy application code
90
  COPY --chown=user:user . .
 
98
  EXPOSE 7860
99
 
100
  # Health check
101
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=5 \
102
  CMD curl -f http://localhost:7860/ || exit 1
103
 
104
  # Single-process FastAPI app (no vLLM sidecar needed)
pipeline.py CHANGED
@@ -29,6 +29,15 @@ def _get_pipeline():
29
  return _pipeline
30
 
31
 
 
 
 
 
 
 
 
 
 
32
  def _page_has_tables(result) -> bool:
33
  """Check if PaddleOCR result contains table elements from layout analysis.
34
 
@@ -43,11 +52,7 @@ def _page_has_tables(result) -> bool:
43
  if block.get('label', '').lower() == 'table':
44
  return True
45
  # Fallback: check markdown content for table patterns
46
- md = result.markdown
47
- if isinstance(md, dict):
48
- md_text = md.get('markdown_texts', '')
49
- else:
50
- md_text = str(md)
51
  return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
52
  except Exception:
53
  return False
@@ -113,19 +118,27 @@ def _convert_document(
113
  # ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
114
  pipeline = _get_pipeline()
115
  paddle_start = time.time()
116
- output = pipeline.predict(str(input_path))
117
  paddle_time = time.time() - paddle_start
118
 
119
- # Collect per-page markdown and detect table pages
120
- page_markdowns = []
121
  table_pages = set()
122
- for i, res in enumerate(output):
123
- md_data = res.markdown
124
- page_markdowns.append(md_data)
125
- # Check if this page has tables from layout analysis
126
  if _page_has_tables(res):
127
  table_pages.add(i)
128
 
 
 
 
 
 
 
 
 
 
 
 
129
  logger.info(
130
  f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
131
  f"in {paddle_time:.2f}s — {len(table_pages)} table pages detected"
@@ -184,12 +197,7 @@ def _convert_document(
184
  if i in gemini_page_texts:
185
  md_parts.append(gemini_page_texts[i])
186
  else:
187
- # Extract markdown text from PaddleOCR result
188
- if isinstance(md_data, dict):
189
- md_text = md_data.get("markdown_texts", "")
190
- else:
191
- md_text = str(md_data)
192
- md_parts.append(md_text)
193
 
194
  markdown_content = "\n\n".join(md_parts)
195
 
 
29
  return _pipeline
30
 
31
 
32
+ def _extract_markdown_text(md_data) -> str:
33
+ """Extract markdown text from PaddleOCR result, handling different formats."""
34
+ if isinstance(md_data, dict):
35
+ # Official API returns {"text": "...", "images": {...}}
36
+ # Some versions use "markdown_texts"
37
+ return md_data.get("text", "") or md_data.get("markdown_texts", "")
38
+ return str(md_data) if md_data else ""
39
+
40
+
41
  def _page_has_tables(result) -> bool:
42
  """Check if PaddleOCR result contains table elements from layout analysis.
43
 
 
52
  if block.get('label', '').lower() == 'table':
53
  return True
54
  # Fallback: check markdown content for table patterns
55
+ md_text = _extract_markdown_text(result.markdown)
 
 
 
 
56
  return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
57
  except Exception:
58
  return False
 
118
  # ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
119
  pipeline = _get_pipeline()
120
  paddle_start = time.time()
121
+ output = pipeline.predict(input=str(input_path))
122
  paddle_time = time.time() - paddle_start
123
 
124
+ # Collect per-page results and detect table pages
125
+ page_results = list(output)
126
  table_pages = set()
127
+ for i, res in enumerate(page_results):
 
 
 
128
  if _page_has_tables(res):
129
  table_pages.add(i)
130
 
131
+ # Use restructure_pages for cross-page table merging and title leveling
132
+ try:
133
+ restructured = pipeline.restructure_pages(
134
+ page_results, merge_table=True, relevel_titles=True
135
+ )
136
+ page_markdowns = [res.markdown for res in restructured]
137
+ logger.info(f"[{request_id}] restructure_pages applied (merge_table + relevel_titles)")
138
+ except Exception as e:
139
+ logger.warning(f"[{request_id}] restructure_pages failed ({e}), using raw results")
140
+ page_markdowns = [res.markdown for res in page_results]
141
+
142
  logger.info(
143
  f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
144
  f"in {paddle_time:.2f}s — {len(table_pages)} table pages detected"
 
197
  if i in gemini_page_texts:
198
  md_parts.append(gemini_page_texts[i])
199
  else:
200
+ md_parts.append(_extract_markdown_text(md_data))
 
 
 
 
 
201
 
202
  markdown_content = "\n\n".join(md_parts)
203