Ibad ur Rehman commited on
Commit
dd23733
·
1 Parent(s): b586eeb

feat: switch to unsloth gguf runtime

Browse files
Files changed (6) hide show
  1. Dockerfile +16 -11
  2. app.py +70 -124
  3. config.py +12 -11
  4. pipeline.py +45 -154
  5. requirements.txt +1 -6
  6. start.sh +12 -4
Dockerfile CHANGED
@@ -1,40 +1,45 @@
1
- # Hugging Face Spaces Dockerfile for Qwen3-VL parser API
2
- # v5.1.0 - Qwen3-VL-8B-Instruct local inference
3
 
4
- FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
5
 
6
  USER root
7
 
8
- # Install fonts and PDF utilities for document parsing
9
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
 
10
  fonts-noto-core fonts-noto-cjk fontconfig \
11
- libgl1 libglib2.0-0 poppler-utils curl git \
12
  && fc-cache -fv && rm -rf /var/lib/apt/lists/*
13
 
14
- # Create non-root user for HF Spaces
15
  RUN useradd -m -u 1000 user
16
 
17
  ENV PYTHONUNBUFFERED=1 \
18
  PYTHONDONTWRITEBYTECODE=1 \
19
  IMAGES_SCALE=2.0 \
20
  MAX_FILE_SIZE_MB=1024 \
 
21
  HF_HOME=/home/user/.cache/huggingface \
22
  XDG_CACHE_HOME=/home/user/.cache \
23
  HOME=/home/user \
24
  PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
25
 
26
- RUN mkdir -p /home/user/.cache/huggingface /home/user/.cache/paddleocr /home/user/app \
27
  && chown -R user:user /home/user
28
 
29
  USER user
30
- WORKDIR /home/user/app
31
 
32
- COPY --chown=user:user requirements.txt .
 
 
 
33
 
34
- RUN pip install --user --upgrade pip && pip install --user -r requirements.txt
 
 
35
 
36
  COPY --chown=user:user . .
37
-
38
  RUN chmod +x start.sh
39
 
40
  EXPOSE 7860
 
1
+ # Hugging Face Spaces Dockerfile for Unsloth GGUF Qwen3-VL parser API
2
+ # v5.2.0 - llama.cpp + Unsloth Qwen3-VL-8B-Instruct GGUF
3
 
4
+ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
5
 
6
  USER root
7
 
 
8
  RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ python3 python3-pip python3-venv \
10
+ build-essential cmake git \
11
  fonts-noto-core fonts-noto-cjk fontconfig \
12
+ libgl1 libglib2.0-0 poppler-utils curl \
13
  && fc-cache -fv && rm -rf /var/lib/apt/lists/*
14
 
 
15
  RUN useradd -m -u 1000 user
16
 
17
  ENV PYTHONUNBUFFERED=1 \
18
  PYTHONDONTWRITEBYTECODE=1 \
19
  IMAGES_SCALE=2.0 \
20
  MAX_FILE_SIZE_MB=1024 \
21
+ LLAMA_SERVER_URL=http://127.0.0.1:8080 \
22
  HF_HOME=/home/user/.cache/huggingface \
23
  XDG_CACHE_HOME=/home/user/.cache \
24
  HOME=/home/user \
25
  PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
26
 
27
+ RUN mkdir -p /home/user/.cache/huggingface /home/user/app \
28
  && chown -R user:user /home/user
29
 
30
  USER user
31
+ WORKDIR /home/user
32
 
33
+ RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp /home/user/llama.cpp
34
+ WORKDIR /home/user/llama.cpp
35
+ RUN cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON
36
+ RUN cmake --build build -j
37
 
38
+ WORKDIR /home/user/app
39
+ COPY --chown=user:user requirements.txt .
40
+ RUN python3 -m pip install --user --upgrade pip && python3 -m pip install --user -r requirements.txt
41
 
42
  COPY --chown=user:user . .
 
43
  RUN chmod +x start.sh
44
 
45
  EXPOSE 7860
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """Qwen3-VL parser API."""
2
 
3
  import asyncio
4
  import re
@@ -16,14 +16,19 @@ from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
16
  from auth import _validate_url, verify_token
17
  from config import (
18
  IMAGES_SCALE,
 
 
 
 
 
 
 
 
 
 
19
  MAX_FILE_SIZE_BYTES,
20
  MAX_FILE_SIZE_MB,
21
- QWEN_ATTN_IMPLEMENTATION,
22
- QWEN_BATCH_SIZE,
23
- QWEN_IMAGE_MAX_SIDE,
24
- QWEN_MAX_NEW_TOKENS,
25
  QWEN_MODEL,
26
- QWEN_TORCH_DTYPE,
27
  RENDER_DPI,
28
  logger,
29
  )
@@ -37,61 +42,52 @@ from pipeline import (
37
  )
38
 
39
 
40
- # ---------------------------------------------------------------------------
41
- # Application Lifespan
42
- # ---------------------------------------------------------------------------
43
-
44
-
45
  @asynccontextmanager
46
  async def lifespan(app: FastAPI):
47
- """Startup: initialize Qwen3-VL pipeline."""
48
  logger.info("=" * 60)
49
- logger.info("Starting Docling VLM Parser API v5.1.0...")
50
- logger.info("Initializing Qwen3-VL pipeline...")
51
  _get_pipeline()
52
- logger.info("Qwen3-VL ready")
53
 
54
  logger.info(f"Render DPI: {RENDER_DPI}")
55
  logger.info(f"Images scale: {IMAGES_SCALE}")
56
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
57
  logger.info(f"Qwen Model: {QWEN_MODEL}")
58
- logger.info(f"Qwen Max New Tokens: {QWEN_MAX_NEW_TOKENS}")
59
- logger.info(f"Qwen Batch Size: {QWEN_BATCH_SIZE}")
60
- logger.info(f"Qwen Image Max Side: {QWEN_IMAGE_MAX_SIDE}")
61
- logger.info(f"Qwen Attention: {QWEN_ATTN_IMPLEMENTATION}")
62
- logger.info(f"Qwen Torch Dtype: {QWEN_TORCH_DTYPE}")
 
 
 
 
 
63
 
64
  logger.info("=" * 60)
65
- logger.info("Docling VLM Parser API ready (Qwen3-VL local parser)")
66
  logger.info("=" * 60)
67
  yield
68
- logger.info("Shutting down Docling VLM Parser API...")
69
-
70
 
71
- # ---------------------------------------------------------------------------
72
- # FastAPI App
73
- # ---------------------------------------------------------------------------
74
 
75
  app = FastAPI(
76
- title="Docling VLM Parser API",
77
- description="Qwen3-VL local parser",
78
- version="5.1.0",
79
  lifespan=lifespan,
80
  )
81
 
82
 
83
- # ---------------------------------------------------------------------------
84
- # Endpoints
85
- # ---------------------------------------------------------------------------
86
-
87
-
88
  @app.get("/", response_model=HealthResponse)
89
  async def health_check() -> HealthResponse:
90
  """Health check endpoint."""
91
  return HealthResponse(
92
  status="healthy",
93
- version="5.1.0",
94
- model="Qwen3-VL-8B-Instruct",
95
  gemini_status="not used",
96
  images_scale=IMAGES_SCALE,
97
  )
@@ -100,34 +96,33 @@ async def health_check() -> HealthResponse:
100
  @app.post("/parse", response_model=ParseResponse)
101
  async def parse_document(
102
  file: UploadFile = File(..., description="PDF or image file to parse"),
103
- output_format: str = Form(default="markdown", description="Output format: markdown or json"),
104
- images_scale: Optional[float] = Form(default=None, description="Image resolution scale"),
105
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
106
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
107
  include_images: bool = Form(default=False, description="Include extracted images"),
108
  _token: str = Depends(verify_token),
109
  ) -> ParseResponse:
110
- """Parse a document file using Qwen3-VL."""
111
  request_id = str(uuid4())[:8]
112
  start_time = time.time()
113
 
114
- logger.info(f"[{request_id}] {'='*50}")
115
  logger.info(f"[{request_id}] New parse request received")
116
- safe_filename = re.sub(r'[\r\n\t\x00-\x1f\x7f]', '_', file.filename or "")[:255]
117
  logger.info(f"[{request_id}] Filename: {safe_filename}")
118
  logger.info(f"[{request_id}] Output format: {output_format}")
119
 
120
- if output_format not in ("markdown",):
121
- raise HTTPException(
122
- status_code=400,
123
- detail="Only 'markdown' output_format is supported",
124
- )
 
125
 
126
- # Validate file size
127
  file.file.seek(0, 2)
128
  file_size = file.file.tell()
129
  file.file.seek(0)
130
-
131
  file_size_mb = file_size / (1024 * 1024)
132
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
133
 
@@ -137,20 +132,18 @@ async def parse_document(
137
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
138
  )
139
 
140
- # Validate file type
141
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
142
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
143
  if file_ext not in allowed_extensions:
144
  raise HTTPException(
145
  status_code=400,
146
- detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
147
  )
148
 
149
  logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
150
- logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
151
 
152
  temp_dir = tempfile.mkdtemp()
153
-
154
  try:
155
  input_path = Path(temp_dir) / f"input{file_ext}"
156
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
@@ -173,35 +166,22 @@ async def parse_document(
173
  images_zip, image_count = _create_images_zip(output_dir)
174
 
175
  total_duration = time.time() - start_time
176
- logger.info(f"[{request_id}] {'='*50}")
177
- logger.info(f"[{request_id}] Request completed successfully")
178
- logger.info(f"[{request_id}] Pages processed: {pages_processed}")
179
- logger.info(f"[{request_id}] Total time: {total_duration:.2f}s")
180
- if pages_processed > 0:
181
- logger.info(f"[{request_id}] Speed: {pages_processed / total_duration:.2f} pages/sec")
182
- logger.info(f"[{request_id}] {'='*50}")
183
 
184
  return ParseResponse(
185
  success=True,
186
- markdown=markdown_content if output_format == "markdown" else None,
187
- json_content=json_content if output_format == "json" else None,
188
  images_zip=images_zip,
189
  image_count=image_count,
190
  pages_processed=pages_processed,
191
  device_used="gpu",
192
  vlm_model=QWEN_MODEL,
193
  )
194
-
195
  except Exception as e:
196
  total_duration = time.time() - start_time
197
- logger.error(f"[{request_id}] {'='*50}")
198
- logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s")
199
- logger.error(f"[{request_id}] Error: {type(e).__name__}: {str(e)}", exc_info=True)
200
- logger.error(f"[{request_id}] {'='*50}")
201
- return ParseResponse(
202
- success=False,
203
- error=f"Processing failed (ref: {request_id})",
204
- )
205
  finally:
206
  shutil.rmtree(temp_dir, ignore_errors=True)
207
 
@@ -211,43 +191,37 @@ async def parse_document_from_url(
211
  request: URLParseRequest,
212
  _token: str = Depends(verify_token),
213
  ) -> ParseResponse:
214
- """Parse a document from a URL using Qwen3-VL."""
215
  request_id = str(uuid4())[:8]
216
  start_time = time.time()
217
 
218
- logger.info(f"[{request_id}] {'='*50}")
219
  logger.info(f"[{request_id}] New URL parse request received")
220
  logger.info(f"[{request_id}] URL: {request.url}")
221
- logger.info(f"[{request_id}] Output format: {request.output_format}")
222
 
223
- if request.output_format not in ("markdown",):
224
- raise HTTPException(
225
- status_code=400,
226
- detail="Only 'markdown' output_format is supported",
227
- )
 
228
 
229
  _validate_url(request.url)
230
 
231
  temp_dir = tempfile.mkdtemp()
232
-
233
  try:
234
- # Download file
235
- logger.info(f"[{request_id}] Downloading file from URL...")
236
- download_start = time.time()
237
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
238
  response = await client.get(request.url)
239
  response.raise_for_status()
240
 
241
- file_size_mb = len(response.content) / (1024 * 1024)
242
- logger.info(
243
- f"[{request_id}] Download completed in {time.time() - download_start:.2f}s "
244
- f"({file_size_mb:.2f} MB)"
245
- )
246
 
247
- # Determine file extension (with Content-Type fallback)
248
  url_path = Path(request.url.split("?")[0])
249
  file_ext = url_path.suffix.lower()
250
-
251
  if not file_ext or file_ext not in {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
252
  content_type = response.headers.get("content-type", "").lower()
253
  ct_map = {
@@ -259,23 +233,12 @@ async def parse_document_from_url(
259
  }
260
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
261
 
262
- if len(response.content) > MAX_FILE_SIZE_BYTES:
263
- raise HTTPException(
264
- status_code=413,
265
- detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
266
- )
267
-
268
  input_path = Path(temp_dir) / f"input{file_ext}"
269
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
270
 
271
  output_dir = Path(temp_dir) / "output"
272
  output_dir.mkdir(exist_ok=True)
273
 
274
- logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
275
- logger.info(
276
- f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
277
- )
278
-
279
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
280
  _convert_document,
281
  input_path,
@@ -291,42 +254,25 @@ async def parse_document_from_url(
291
  images_zip, image_count = _create_images_zip(output_dir)
292
 
293
  total_duration = time.time() - start_time
294
- logger.info(f"[{request_id}] {'='*50}")
295
- logger.info(f"[{request_id}] Request completed successfully")
296
- logger.info(f"[{request_id}] Pages processed: {pages_processed}")
297
- logger.info(f"[{request_id}] Total time: {total_duration:.2f}s")
298
- if pages_processed > 0:
299
- logger.info(f"[{request_id}] Speed: {pages_processed / total_duration:.2f} pages/sec")
300
- logger.info(f"[{request_id}] {'='*50}")
301
 
302
  return ParseResponse(
303
  success=True,
304
- markdown=markdown_content if request.output_format == "markdown" else None,
305
- json_content=json_content if request.output_format == "json" else None,
306
  images_zip=images_zip,
307
  image_count=image_count,
308
  pages_processed=pages_processed,
309
  device_used="gpu",
310
  vlm_model=QWEN_MODEL,
311
  )
312
-
313
  except httpx.HTTPError as e:
314
- total_duration = time.time() - start_time
315
- logger.error(f"[{request_id}] Download failed after {total_duration:.2f}s: {str(e)}")
316
- return ParseResponse(
317
- success=False,
318
- error=f"Failed to download file from URL (ref: {request_id})",
319
- )
320
  except Exception as e:
321
  total_duration = time.time() - start_time
322
- logger.error(f"[{request_id}] {'='*50}")
323
- logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s")
324
- logger.error(f"[{request_id}] Error: {type(e).__name__}: {str(e)}", exc_info=True)
325
- logger.error(f"[{request_id}] {'='*50}")
326
- return ParseResponse(
327
- success=False,
328
- error=f"Processing failed (ref: {request_id})",
329
- )
330
  finally:
331
  shutil.rmtree(temp_dir, ignore_errors=True)
332
 
 
1
+ """Unsloth Qwen3-VL GGUF parser API."""
2
 
3
  import asyncio
4
  import re
 
16
  from auth import _validate_url, verify_token
17
  from config import (
18
  IMAGES_SCALE,
19
+ LLAMA_CTX_SIZE,
20
+ LLAMA_FLASH_ATTN,
21
+ LLAMA_GPU_LAYERS,
22
+ LLAMA_HF_FILE,
23
+ LLAMA_HF_REPO,
24
+ LLAMA_MAX_TOKENS,
25
+ LLAMA_MMPROJ_FILE,
26
+ LLAMA_SERVER_TIMEOUT,
27
+ LLAMA_SERVER_URL,
28
+ LLAMA_THREADS,
29
  MAX_FILE_SIZE_BYTES,
30
  MAX_FILE_SIZE_MB,
 
 
 
 
31
  QWEN_MODEL,
 
32
  RENDER_DPI,
33
  logger,
34
  )
 
42
  )
43
 
44
 
 
 
 
 
 
45
  @asynccontextmanager
46
  async def lifespan(app: FastAPI):
47
+ """Startup: initialize local llama.cpp client."""
48
  logger.info("=" * 60)
49
+ logger.info("Starting Docling Parser API v5.2.0...")
50
+ logger.info("Initializing local llama.cpp client...")
51
  _get_pipeline()
52
+ logger.info("llama.cpp client ready")
53
 
54
  logger.info(f"Render DPI: {RENDER_DPI}")
55
  logger.info(f"Images scale: {IMAGES_SCALE}")
56
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
57
  logger.info(f"Qwen Model: {QWEN_MODEL}")
58
+ logger.info(f"llama-server URL: {LLAMA_SERVER_URL}")
59
+ logger.info(f"llama-server timeout: {LLAMA_SERVER_TIMEOUT}s")
60
+ logger.info(f"llama HF repo: {LLAMA_HF_REPO}")
61
+ logger.info(f"llama model file: {LLAMA_HF_FILE}")
62
+ logger.info(f"llama mmproj file: {LLAMA_MMPROJ_FILE}")
63
+ logger.info(f"llama max tokens: {LLAMA_MAX_TOKENS}")
64
+ logger.info(f"llama ctx size: {LLAMA_CTX_SIZE}")
65
+ logger.info(f"llama gpu layers: {LLAMA_GPU_LAYERS}")
66
+ logger.info(f"llama threads: {LLAMA_THREADS}")
67
+ logger.info(f"llama flash attention: {LLAMA_FLASH_ATTN}")
68
 
69
  logger.info("=" * 60)
70
+ logger.info("Docling Parser API ready (Unsloth GGUF via llama.cpp)")
71
  logger.info("=" * 60)
72
  yield
73
+ logger.info("Shutting down Docling Parser API...")
 
74
 
 
 
 
75
 
76
  app = FastAPI(
77
+ title="Docling Parser API",
78
+ description="Unsloth Qwen3-VL GGUF local parser",
79
+ version="5.2.0",
80
  lifespan=lifespan,
81
  )
82
 
83
 
 
 
 
 
 
84
  @app.get("/", response_model=HealthResponse)
85
  async def health_check() -> HealthResponse:
86
  """Health check endpoint."""
87
  return HealthResponse(
88
  status="healthy",
89
+ version="5.2.0",
90
+ model="Qwen3-VL-8B-Instruct GGUF",
91
  gemini_status="not used",
92
  images_scale=IMAGES_SCALE,
93
  )
 
96
  @app.post("/parse", response_model=ParseResponse)
97
  async def parse_document(
98
  file: UploadFile = File(..., description="PDF or image file to parse"),
99
+ output_format: str = Form(default="markdown", description="Output format: markdown only"),
100
+ images_scale: Optional[float] = Form(default=None, description="Reserved for compatibility"),
101
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
102
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
103
  include_images: bool = Form(default=False, description="Include extracted images"),
104
  _token: str = Depends(verify_token),
105
  ) -> ParseResponse:
106
+ """Parse a document file using local llama.cpp + Unsloth GGUF."""
107
  request_id = str(uuid4())[:8]
108
  start_time = time.time()
109
 
110
+ logger.info(f"[{request_id}] {'=' * 50}")
111
  logger.info(f"[{request_id}] New parse request received")
112
+ safe_filename = re.sub(r"[\r\n\t\x00-\x1f\x7f]", "_", file.filename or "")[:255]
113
  logger.info(f"[{request_id}] Filename: {safe_filename}")
114
  logger.info(f"[{request_id}] Output format: {output_format}")
115
 
116
+ if output_format != "markdown":
117
+ raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
118
+ if start_page < 0:
119
+ raise HTTPException(status_code=400, detail="start_page must be >= 0")
120
+ if end_page is not None and end_page < start_page:
121
+ raise HTTPException(status_code=400, detail="end_page must be >= start_page")
122
 
 
123
  file.file.seek(0, 2)
124
  file_size = file.file.tell()
125
  file.file.seek(0)
 
126
  file_size_mb = file_size / (1024 * 1024)
127
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
128
 
 
132
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
133
  )
134
 
 
135
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
136
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
137
  if file_ext not in allowed_extensions:
138
  raise HTTPException(
139
  status_code=400,
140
+ detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
141
  )
142
 
143
  logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
144
+ logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
145
 
146
  temp_dir = tempfile.mkdtemp()
 
147
  try:
148
  input_path = Path(temp_dir) / f"input{file_ext}"
149
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
 
166
  images_zip, image_count = _create_images_zip(output_dir)
167
 
168
  total_duration = time.time() - start_time
169
+ logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
 
 
 
 
 
 
170
 
171
  return ParseResponse(
172
  success=True,
173
+ markdown=markdown_content,
174
+ json_content=json_content,
175
  images_zip=images_zip,
176
  image_count=image_count,
177
  pages_processed=pages_processed,
178
  device_used="gpu",
179
  vlm_model=QWEN_MODEL,
180
  )
 
181
  except Exception as e:
182
  total_duration = time.time() - start_time
183
+ logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
184
+ return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
 
 
 
 
 
 
185
  finally:
186
  shutil.rmtree(temp_dir, ignore_errors=True)
187
 
 
191
  request: URLParseRequest,
192
  _token: str = Depends(verify_token),
193
  ) -> ParseResponse:
194
+ """Parse a document from URL using local llama.cpp + Unsloth GGUF."""
195
  request_id = str(uuid4())[:8]
196
  start_time = time.time()
197
 
198
+ logger.info(f"[{request_id}] {'=' * 50}")
199
  logger.info(f"[{request_id}] New URL parse request received")
200
  logger.info(f"[{request_id}] URL: {request.url}")
 
201
 
202
+ if request.output_format != "markdown":
203
+ raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
204
+ if request.start_page < 0:
205
+ raise HTTPException(status_code=400, detail="start_page must be >= 0")
206
+ if request.end_page is not None and request.end_page < request.start_page:
207
+ raise HTTPException(status_code=400, detail="end_page must be >= start_page")
208
 
209
  _validate_url(request.url)
210
 
211
  temp_dir = tempfile.mkdtemp()
 
212
  try:
 
 
 
213
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
214
  response = await client.get(request.url)
215
  response.raise_for_status()
216
 
217
+ if len(response.content) > MAX_FILE_SIZE_BYTES:
218
+ raise HTTPException(
219
+ status_code=413,
220
+ detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
221
+ )
222
 
 
223
  url_path = Path(request.url.split("?")[0])
224
  file_ext = url_path.suffix.lower()
 
225
  if not file_ext or file_ext not in {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
226
  content_type = response.headers.get("content-type", "").lower()
227
  ct_map = {
 
233
  }
234
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
235
 
 
 
 
 
 
 
236
  input_path = Path(temp_dir) / f"input{file_ext}"
237
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
238
 
239
  output_dir = Path(temp_dir) / "output"
240
  output_dir.mkdir(exist_ok=True)
241
 
 
 
 
 
 
242
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
243
  _convert_document,
244
  input_path,
 
254
  images_zip, image_count = _create_images_zip(output_dir)
255
 
256
  total_duration = time.time() - start_time
257
+ logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
 
 
 
 
 
 
258
 
259
  return ParseResponse(
260
  success=True,
261
+ markdown=markdown_content,
262
+ json_content=json_content,
263
  images_zip=images_zip,
264
  image_count=image_count,
265
  pages_processed=pages_processed,
266
  device_used="gpu",
267
  vlm_model=QWEN_MODEL,
268
  )
 
269
  except httpx.HTTPError as e:
270
+ logger.error(f"[{request_id}] Download failed: {e}")
271
+ return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
 
 
 
 
272
  except Exception as e:
273
  total_duration = time.time() - start_time
274
+ logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
275
+ return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
 
 
 
 
 
 
276
  finally:
277
  shutil.rmtree(temp_dir, ignore_errors=True)
278
 
config.py CHANGED
@@ -1,9 +1,8 @@
1
- """Configuration, environment variables, and logging setup for the Qwen parser."""
2
 
3
  import logging
4
  import os
5
 
6
- # Configure logging
7
  logging.basicConfig(
8
  level=logging.INFO,
9
  format="%(asctime)s | %(levelname)-8s | %(message)s",
@@ -11,23 +10,25 @@ logging.basicConfig(
11
  )
12
  logger = logging.getLogger("docling-parser")
13
 
14
- # Security
15
  API_TOKEN = os.getenv("API_TOKEN")
16
 
17
- # Configuration
18
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
19
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
20
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
21
  RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
22
 
23
- QWEN_MODEL = os.getenv("QWEN_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
24
- QWEN_MAX_NEW_TOKENS = int(os.getenv("QWEN_MAX_NEW_TOKENS", "1536"))
25
- QWEN_BATCH_SIZE = int(os.getenv("QWEN_BATCH_SIZE", "2"))
26
- QWEN_IMAGE_MAX_SIDE = int(os.getenv("QWEN_IMAGE_MAX_SIDE", "1536"))
27
- QWEN_ATTN_IMPLEMENTATION = os.getenv("QWEN_ATTN_IMPLEMENTATION", "flash_attention_2")
28
- QWEN_TORCH_DTYPE = os.getenv("QWEN_TORCH_DTYPE", "bfloat16")
 
 
 
 
 
29
 
30
- # Blocked hostnames for SSRF protection
31
  BLOCKED_HOSTNAMES = {
32
  "localhost",
33
  "metadata",
 
1
+ """Configuration, environment variables, and logging setup for the Unsloth Qwen parser."""
2
 
3
  import logging
4
  import os
5
 
 
6
  logging.basicConfig(
7
  level=logging.INFO,
8
  format="%(asctime)s | %(levelname)-8s | %(message)s",
 
10
  )
11
  logger = logging.getLogger("docling-parser")
12
 
 
13
  API_TOKEN = os.getenv("API_TOKEN")
14
 
 
15
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
16
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
17
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
18
  RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
19
 
20
+ QWEN_MODEL = os.getenv("QWEN_MODEL", "unsloth/Qwen3-VL-8B-Instruct-GGUF")
21
+ LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
22
+ LLAMA_SERVER_TIMEOUT = float(os.getenv("LLAMA_SERVER_TIMEOUT", "300"))
23
+ LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "1536"))
24
+ LLAMA_HF_REPO = os.getenv("LLAMA_HF_REPO", "unsloth/Qwen3-VL-8B-Instruct-GGUF")
25
+ LLAMA_HF_FILE = os.getenv("LLAMA_HF_FILE", "Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf")
26
+ LLAMA_MMPROJ_FILE = os.getenv("LLAMA_MMPROJ_FILE", "mmproj-F16.gguf")
27
+ LLAMA_CTX_SIZE = int(os.getenv("LLAMA_CTX_SIZE", "8192"))
28
+ LLAMA_GPU_LAYERS = int(os.getenv("LLAMA_GPU_LAYERS", "99"))
29
+ LLAMA_THREADS = int(os.getenv("LLAMA_THREADS", "8"))
30
+ LLAMA_FLASH_ATTN = os.getenv("LLAMA_FLASH_ATTN", "on")
31
 
 
32
  BLOCKED_HOSTNAMES = {
33
  "localhost",
34
  "metadata",
pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- """Qwen3-VL pipeline, page rendering, and file helpers."""
2
 
3
  import base64
4
  import io
@@ -7,25 +7,18 @@ import zipfile
7
  from pathlib import Path
8
  from typing import BinaryIO, Optional
9
 
10
- import torch
11
- from PIL import Image
12
- from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
13
 
14
  from config import (
15
- QWEN_ATTN_IMPLEMENTATION,
16
- QWEN_BATCH_SIZE,
17
- QWEN_IMAGE_MAX_SIDE,
18
- QWEN_MAX_NEW_TOKENS,
19
  QWEN_MODEL,
20
- QWEN_TORCH_DTYPE,
21
  logger,
22
  )
23
  from postprocess import _post_process_merged_markdown
24
  from rendering import _image_file_to_png_bytes, _pdf_to_page_images
25
 
26
- _model = None
27
- _processor = None
28
-
29
  _OCR_PROMPT = (
30
  "Convert this document page to clean markdown.\n\n"
31
  "Rules:\n"
@@ -39,50 +32,9 @@ _OCR_PROMPT = (
39
  )
40
 
41
 
42
- def _resolve_torch_dtype() -> torch.dtype | str:
43
- """Resolve configured dtype to a torch dtype when possible."""
44
- dtype_map = {
45
- "auto": "auto",
46
- "bfloat16": torch.bfloat16,
47
- "float16": torch.float16,
48
- "float32": torch.float32,
49
- }
50
- return dtype_map.get(QWEN_TORCH_DTYPE.lower(), "auto")
51
-
52
-
53
- def _get_pipeline() -> tuple[Qwen3VLForConditionalGeneration, AutoProcessor]:
54
- """Get or create the global Qwen3-VL pipeline."""
55
- global _model, _processor
56
- if _model is None or _processor is None:
57
- logger.info(f"Loading Qwen model: {QWEN_MODEL}")
58
- _processor = AutoProcessor.from_pretrained(QWEN_MODEL, trust_remote_code=True)
59
- model_kwargs = {
60
- "torch_dtype": _resolve_torch_dtype(),
61
- "device_map": "auto",
62
- "trust_remote_code": True,
63
- }
64
- if QWEN_ATTN_IMPLEMENTATION and QWEN_ATTN_IMPLEMENTATION.lower() != "none":
65
- model_kwargs["attn_implementation"] = QWEN_ATTN_IMPLEMENTATION
66
- try:
67
- _model = Qwen3VLForConditionalGeneration.from_pretrained(
68
- QWEN_MODEL,
69
- **model_kwargs,
70
- )
71
- except Exception as e:
72
- if "attn_implementation" in model_kwargs:
73
- logger.warning(
74
- f"Failed to load Qwen with attn_implementation={QWEN_ATTN_IMPLEMENTATION}: {e}. "
75
- "Retrying without custom attention."
76
- )
77
- model_kwargs.pop("attn_implementation", None)
78
- _model = Qwen3VLForConditionalGeneration.from_pretrained(
79
- QWEN_MODEL,
80
- **model_kwargs,
81
- )
82
- else:
83
- raise
84
- _model.eval()
85
- return _model, _processor
86
 
87
 
88
  def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
@@ -117,98 +69,6 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
117
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
118
 
119
 
120
- def _resize_image(image: Image.Image) -> Image.Image:
121
- """Downscale images to reduce visual token count and generation latency."""
122
- max_side = max(image.size)
123
- if max_side <= QWEN_IMAGE_MAX_SIDE:
124
- return image
125
-
126
- scale = QWEN_IMAGE_MAX_SIDE / max_side
127
- new_size = (
128
- max(1, int(image.size[0] * scale)),
129
- max(1, int(image.size[1] * scale)),
130
- )
131
- return image.resize(new_size, Image.Resampling.LANCZOS)
132
-
133
-
134
- def _extract_markdown_from_images(
135
- page_images: list[tuple[int, bytes]],
136
- request_id: str,
137
- ) -> dict[int, str]:
138
- """Run a batch of page images through Qwen3-VL."""
139
- model, processor = _get_pipeline()
140
- prompt_texts: list[str] = []
141
- images: list[Image.Image] = []
142
- page_indices: list[int] = []
143
-
144
- for page_idx, image_bytes in page_images:
145
- image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
146
- image = _resize_image(image)
147
- messages = [
148
- {
149
- "role": "user",
150
- "content": [
151
- {"type": "image", "image": image},
152
- {"type": "text", "text": _OCR_PROMPT},
153
- ],
154
- }
155
- ]
156
- prompt_texts.append(
157
- processor.apply_chat_template(
158
- messages,
159
- tokenize=False,
160
- add_generation_prompt=True,
161
- )
162
- )
163
- images.append(image)
164
- page_indices.append(page_idx)
165
-
166
- inputs = processor(
167
- text=prompt_texts,
168
- images=images,
169
- padding=True,
170
- return_tensors="pt",
171
- )
172
-
173
- device = next(model.parameters()).device
174
- model_inputs = {
175
- key: value.to(device) if hasattr(value, "to") else value
176
- for key, value in inputs.items()
177
- }
178
-
179
- with torch.inference_mode():
180
- generated_ids = model.generate(
181
- **model_inputs,
182
- max_new_tokens=QWEN_MAX_NEW_TOKENS,
183
- do_sample=False,
184
- )
185
-
186
- input_lengths = model_inputs["attention_mask"].sum(dim=1).tolist()
187
- decoded_pages: dict[int, str] = {}
188
- for row_idx, prompt_length in enumerate(input_lengths):
189
- output_ids = generated_ids[row_idx : row_idx + 1, int(prompt_length) :]
190
- text = processor.batch_decode(
191
- output_ids,
192
- skip_special_tokens=True,
193
- clean_up_tokenization_spaces=False,
194
- )[0].strip()
195
- page_idx = page_indices[row_idx]
196
- decoded_pages[page_idx] = text
197
- logger.info(f"[{request_id}:page:{page_idx + 1}] Qwen generated {len(text)} chars")
198
-
199
- return decoded_pages
200
-
201
-
202
- def _extract_markdown_from_image(
203
- image_bytes: bytes,
204
- page_label: str,
205
- ) -> str:
206
- """Backwards-compatible single-image wrapper."""
207
- page_idx = 0
208
- page_map = _extract_markdown_from_images([(page_idx, image_bytes)], page_label)
209
- return page_map[page_idx]
210
-
211
-
212
  def _collect_page_images(
213
  input_path: Path,
214
  request_id: str,
@@ -228,6 +88,39 @@ def _collect_page_images(
228
  return [(0, _image_file_to_png_bytes(input_path))]
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def _convert_document(
232
  input_path: Path,
233
  output_dir: Path,
@@ -236,17 +129,15 @@ def _convert_document(
236
  start_page: int = 0,
237
  end_page: Optional[int] = None,
238
  ) -> tuple:
239
- """Render pages and parse them with Qwen3-VL."""
240
  page_images = _collect_page_images(input_path, request_id, start_page, end_page)
241
  if not page_images:
242
  raise ValueError("No pages available to parse")
243
 
244
  markdown_pages: list[str] = []
245
- for batch_start in range(0, len(page_images), QWEN_BATCH_SIZE):
246
- batch = page_images[batch_start : batch_start + QWEN_BATCH_SIZE]
247
- batch_outputs = _extract_markdown_from_images(batch, request_id)
248
- for page_idx, _ in batch:
249
- markdown_pages.append(batch_outputs.get(page_idx, ""))
250
 
251
  markdown_content = "\n\n".join(p for p in markdown_pages if p).strip()
252
  markdown_content = _post_process_merged_markdown(markdown_content)
 
1
+ """Unsloth GGUF Qwen3-VL pipeline and file helpers."""
2
 
3
  import base64
4
  import io
 
7
  from pathlib import Path
8
  from typing import BinaryIO, Optional
9
 
10
+ import httpx
 
 
11
 
12
  from config import (
13
+ LLAMA_MAX_TOKENS,
14
+ LLAMA_SERVER_TIMEOUT,
15
+ LLAMA_SERVER_URL,
 
16
  QWEN_MODEL,
 
17
  logger,
18
  )
19
  from postprocess import _post_process_merged_markdown
20
  from rendering import _image_file_to_png_bytes, _pdf_to_page_images
21
 
 
 
 
22
  _OCR_PROMPT = (
23
  "Convert this document page to clean markdown.\n\n"
24
  "Rules:\n"
 
32
  )
33
 
34
 
35
def _get_pipeline() -> str:
    """Compatibility shim kept for app startup.

    Returns:
        The base URL of the local llama.cpp server that replaces the old
        in-process model pipeline.
    """
    endpoint = LLAMA_SERVER_URL
    return endpoint
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
 
69
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def _collect_page_images(
73
  input_path: Path,
74
  request_id: str,
 
88
  return [(0, _image_file_to_png_bytes(input_path))]
89
 
90
 
91
def _call_llama_server(image_bytes: bytes, page_label: str) -> str:
    """Send a page image to the local llama.cpp OpenAI-compatible server.

    Args:
        image_bytes: PNG-encoded page image.
        page_label: Identifier used only for logging (e.g. "req:page:3").

    Returns:
        The model's markdown transcription of the page; empty string when
        the server returns no usable completion.

    Raises:
        httpx.HTTPStatusError: If the server responds with a non-2xx status.
        httpx.TransportError: On connection or timeout failures.
    """
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": QWEN_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _OCR_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            }
        ],
        # Deterministic decoding: OCR output should not vary between runs.
        "temperature": 0.0,
        "max_tokens": LLAMA_MAX_TOKENS,
    }

    response = httpx.post(
        f"{LLAMA_SERVER_URL}/v1/chat/completions",
        json=payload,
        timeout=LLAMA_SERVER_TIMEOUT,
    )
    response.raise_for_status()
    data = response.json()
    # Defensive extraction: `data.get("choices", [{}])[0]` would raise
    # IndexError when the server returns `"choices": []` (the .get default
    # is only used when the key is absent), and a present-but-null
    # "content" would make `.strip()` raise AttributeError.
    choices = data.get("choices") or [{}]
    content = choices[0].get("message", {}).get("content") or ""
    text = content.strip()
    logger.info(f"[{page_label}] llama-server generated {len(text)} chars")
    return text
122
+
123
+
124
  def _convert_document(
125
  input_path: Path,
126
  output_dir: Path,
 
129
  start_page: int = 0,
130
  end_page: Optional[int] = None,
131
  ) -> tuple:
132
+ """Render pages and parse them with the local Unsloth GGUF server."""
133
  page_images = _collect_page_images(input_path, request_id, start_page, end_page)
134
  if not page_images:
135
  raise ValueError("No pages available to parse")
136
 
137
  markdown_pages: list[str] = []
138
+ for page_idx, image_bytes in page_images:
139
+ page_label = f"{request_id}:page:{page_idx + 1}"
140
+ markdown_pages.append(_call_llama_server(image_bytes, page_label))
 
 
141
 
142
  markdown_content = "\n\n".join(p for p in markdown_pages if p).strip()
143
  markdown_content = _post_process_merged_markdown(markdown_content)
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # Qwen3-VL parser API dependencies
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
@@ -7,8 +7,3 @@ pydantic>=2.0.0
7
  opencv-python-headless>=4.10.0
8
  pdf2image>=1.17.0
9
  huggingface-hub>=0.25.0
10
- Pillow>=10.0.0
11
- accelerate>=0.34.0
12
- torch>=2.4.0
13
- torchvision>=0.19.0
14
- transformers @ git+https://github.com/huggingface/transformers.git
 
1
+ # Unsloth GGUF Qwen3-VL parser API dependencies
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
 
7
  opencv-python-headless>=4.10.0
8
  pdf2image>=1.17.0
9
  huggingface-hub>=0.25.0
 
 
 
 
 
start.sh CHANGED
@@ -1,7 +1,15 @@
1
  #!/bin/bash
2
- # Start the PaddleOCR-VL + Gemini hybrid parser API
3
- # Single process: FastAPI with PaddleOCR-VL-1.5 loaded in-process
4
- # Note: Dockerfile should ensure this script is executable (chmod +x)
 
 
 
 
 
 
 
 
 
5
 
6
- # Start FastAPI
7
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
 
1
#!/bin/bash
# Launch the llama.cpp GGUF server in the background, wait until it is
# actually listening, then run the FastAPI app in the foreground on 7860.
set -euo pipefail

/home/user/llama.cpp/build/bin/llama-server \
  --host 0.0.0.0 \
  --port 8080 \
  --hf-repo "${LLAMA_HF_REPO}" \
  --hf-file "${LLAMA_HF_FILE}" \
  --mmproj "${LLAMA_MMPROJ_FILE}" \
  --ctx-size "${LLAMA_CTX_SIZE}" \
  --n-gpu-layers "${LLAMA_GPU_LAYERS}" \
  --threads "${LLAMA_THREADS}" \
  --flash-attn "${LLAMA_FLASH_ATTN}" &
LLAMA_PID=$!

# Don't let the API accept traffic before the backend is listening, and
# fail the container fast if the model server dies during startup (e.g.
# model download or GPU init failure). Uses the bash /dev/tcp builtin so
# no extra tooling (curl/nc) is required in the image.
echo "Waiting for llama-server on port 8080..."
until (exec 3<>/dev/tcp/127.0.0.1/8080) 2>/dev/null; do
    if ! kill -0 "${LLAMA_PID}" 2>/dev/null; then
        echo "llama-server exited during startup" >&2
        exit 1
    fi
    sleep 1
done
echo "llama-server is up."

exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1