Spaces:

sfdghsdvxfbgn
/

node-2

Sleeping

App Files Files Community

node-2 / main.py

sfdghsdvxfbgn

Upload 7 files

d42d358 verified 16 days ago

Raw

History Blame Contribute Delete

39.7 kB

	"""
	MinerU OCR & Document Extraction Service
	FastAPI application for Hugging Face Docker Space (CPU / pipeline backend)

	Correct imports for magic-pdf >= 1.0.x (magic_pdf module):

	from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
	from magic_pdf.data.dataset import PymuDocDataset, ImageDataset
	from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
	from magic_pdf.config.enums import SupportedPdfParseMethod

	OBSOLETE imports (removed in magic-pdf >= 1.0, do not use):
	from magic_pdf.pipe.UNIPipe import UNIPipe ← removed
	from magic_pdf.rw.DiskReaderWriter import ... ← removed
	from magic_pdf.data.read_api import read_local_images ← NOT used here;
	function expects a single string path in 1.x but crashes with
	"stat: ... not list" if given a list. Use FileBasedDataReader instead.

	Removed features vs original:
	- DOCX / PPTX / XLSX (required LibreOffice; caused build OOM/timeout)
	- subprocess (was only used for LibreOffice conversion)
	- python-magic / libmagic (listed in requirements but never imported)

	Endpoints:
	GET /health — liveness (always fast, no dependency check)
	GET /status — full node status including memory (via cgroups), uptime,
	cache, active requests, lastModelLoadMs
	POST /extract — single file (PDF or image) with SHA256 cache + memory guard
	POST /batch — up to BATCH_MAX_FILES files; extras silently ignored

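	Structured error format (non-2xx responses from /extract and /batch, and
	per-file failure entries inside the /batch "results" list):
	{
	"success": false,
	"stage": "upload" | "validation" | "decode" | "ocr" | "markdown" |
	"model_load" | "unknown",
	"errorCode": "UNSUPPORTED_TYPE" | "FILE_TOO_LARGE" | "EMPTY_FILE" |
	"LOW_MEMORY" | "IMAGE_DECODE_FAILED" | "OCR_PIPELINE_FAILED" |
	"MARKDOWN_FAILED" | "INTERNAL_ERROR" | "CONFIG_MISSING" | "IMPORT_FAILED" |
	"PDF_WRITER_FAILED" | "PDF_PARSE_FAILED" | "IMAGE_WRITER_FAILED" |
	"WRITE_FAILED" | "IMAGE_DATASET_FAILED" | "HEIF_NOT_SUPPORTED",
	"message": "<human-readable detail>"
	}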
	"""

	import hashlib
	import io
	import os
	import re
	import shutil
	import sys
	import tempfile
	import threading
	import time
	import traceback
	import logging
	from importlib.metadata import version as pkg_version
	from typing import Any

	import fitz # PyMuPDF — bundled with magic-pdf[full-cpu]
	from PIL import Image

	from fastapi import FastAPI, File, UploadFile, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse

	# ── Logging ───────────────────────────────────────────────────────────────────
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s %(levelname)-8s %(name)s %(message)s",
	)
	logger = logging.getLogger("mineru-service")

	# ── Start time ────────────────────────────────────────────────────────────────
	_START_TIME: float = time.time()

	# ── Upload / batch constants ──────────────────────────────────────────────────
	MAX_UPLOAD_BYTES = 30 * 1024 * 1024 # 30 MB
	BATCH_MAX_FILES = 8

	# ── Supported file types ──────────────────────────────────────────────────────
	PDF_EXTENSIONS = {"pdf"}

	# Natively supported by ImageDataset via FileBasedDataReader
	NATIVE_IMAGE_EXTENSIONS = {"jpg", "jpeg", "png"}

	# Need Pillow conversion to PNG before feeding to MinerU
	PILLOW_IMAGE_EXTENSIONS = {"webp", "bmp", "tiff", "tif", "gif", "heic", "heif", "avif"}

	IMAGE_EXTENSIONS = NATIVE_IMAGE_EXTENSIONS \| PILLOW_IMAGE_EXTENSIONS
	ALLOWED_EXTENSIONS = PDF_EXTENSIONS \| IMAGE_EXTENSIONS

	# ── Memory safety thresholds ──────────────────────────────────────────────────
	BYTES_PER_OCR_PAGE = 100 * 1024 * 1024 # ~100 MB / page (conservative)
	IMAGE_MEMORY_FACTOR = 4 # decoded pixels × 4 for pipeline buffers
	MEM_SAFETY_FLOOR_MB = 1024 # always keep 1 GB free

	# ── SHA256 extraction cache (in-process, bounded by available RAM) ────────────
	_cache: dict[str, dict[str, Any]] = {}
	_cache_lock = threading.Lock()

	# ── Active-request counter ────────────────────────────────────────────────────
	_active_requests: int = 0
	_active_lock = threading.Lock()

	# ── Model load timing ─────────────────────────────────────────────────────────
	_model_load_ms: int = 0

	# ── Startup self-test results (populated by startup handler) ──────────────────
	_startup_issues: list[str] = []
	_startup_done: bool = False


	# ─────────────────────────────────────────────────────────────────────────────
	# Structured error exception
	# ─────────────────────────────────────────────────────────────────────────────
	class ExtractionError(Exception):
	    """
	    Pipeline failure that carries enough context to be rendered as the
	    structured JSON error body (stage + errorCode + message) together with
	    the HTTP status the endpoint should answer with.
	    """

	    def __init__(self, stage: str, code: str, message: str, http_status: int = 422) -> None:
	        super().__init__(message)
	        self.stage, self.code = stage, code
	        self.message = message
	        self.http_status = http_status

	    def to_dict(self) -> dict[str, Any]:
	        """Return the JSON-serialisable error body (always ``success: False``)."""
	        payload: dict[str, Any] = {"success": False}
	        payload["stage"] = self.stage
	        payload["errorCode"] = self.code
	        payload["message"] = self.message
	        return payload


	def _err(stage: str, code: str, msg: str, status: int = 422) -> ExtractionError:
	    """Build (but do not raise) an ExtractionError; callers write ``raise _err(...)``."""
	    return ExtractionError(stage=stage, code=code, message=msg, http_status=status)


	# ─────────────────────────────────────────────────────────────────────────────
	# Active request helpers
	# ─────────────────────────────────────────────────────────────────────────────
	def _inc_active() -> None:
	    """Add one to the module-level in-flight extraction counter, under its lock."""
	    global _active_requests
	    _active_lock.acquire()
	    try:
	        _active_requests = _active_requests + 1
	    finally:
	        _active_lock.release()


	def _dec_active() -> None:
	    """Subtract one from the in-flight extraction counter, never going below zero."""
	    global _active_requests
	    _active_lock.acquire()
	    try:
	        remaining = _active_requests - 1
	        # Clamp so an unmatched decrement cannot produce a negative count.
	        _active_requests = remaining if remaining > 0 else 0
	    finally:
	        _active_lock.release()


	# ─────────────────────────────────────────────────────────────────────────────
	# Pipeline readiness (lazy, first-request check)
	# ─────────────────────────────────────────────────────────────────────────────
	_pipeline_lock = threading.Lock()
	_pipeline_ready: bool = False


	def _ensure_pipeline() -> None:
	    """
	    Confirm, once per process, that the MinerU config file exists and that
	    the magic_pdf modules can be imported.

	    On the first success _pipeline_ready is set and the import duration is
	    stored in _model_load_ms; later calls return immediately.

	    Raises:
	        ExtractionError: stage "model_load", HTTP 503, with code
	            CONFIG_MISSING or IMPORT_FAILED.
	    """
	    global _pipeline_ready, _model_load_ms

	    # Fast path: no lock needed once a previous call has succeeded.
	    if _pipeline_ready:
	        return

	    with _pipeline_lock:
	        # Re-check under the lock: another thread may have finished meanwhile.
	        if _pipeline_ready:
	            return

	        config_path = os.path.expanduser("~/magic-pdf.json")
	        config_present = os.path.exists(config_path)
	        if not config_present:
	            raise _err(
	                "model_load", "CONFIG_MISSING",
	                f"magic-pdf.json not found at {config_path}. "
	                "Check Docker build log for download_models.py output.",
	                503,
	            )

	        # Import only; nothing is instantiated, so this times the import itself.
	        started = time.perf_counter()
	        try:
	            from magic_pdf.data.dataset import PymuDocDataset, ImageDataset  # noqa: F401
	            from magic_pdf.data.data_reader_writer import (  # noqa: F401
	                FileBasedDataReader, FileBasedDataWriter,
	            )
	        except ImportError as exc:
	            raise _err(
	                "model_load", "IMPORT_FAILED",
	                f"magic_pdf not importable: {exc}. Check Dockerfile pip layers.",
	                503,
	            ) from exc
	        elapsed_s = time.perf_counter() - started

	        _model_load_ms = int(elapsed_s * 1000)
	        _pipeline_ready = True
	        logger.info("Pipeline ready (import check: %d ms).", _model_load_ms)


	# ─────────────────────────────────────────────────────────────────────────────
	# FastAPI app
	# ─────────────────────────────────────────────────────────────────────────────
	app = FastAPI(
	    version="1.1.0",
	    title="MinerU OCR Service",
	    description=(
	        "OCR and document extraction via MinerU pipeline backend (CPU). "
	        "Supports PDF and image files up to 30 MB."
	    ),
	)

	# CORS: any origin and any header, restricted to the GET and POST methods.
	app.add_middleware(
	    CORSMiddleware,
	    allow_headers=["*"],
	    allow_methods=["GET", "POST"],
	    allow_origins=["*"],
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# Startup self-test
	# ─────────────────────────────────────────────────────────────────────────────
	@app.on_event("startup")
	async def startup_self_test() -> None:
	    """
	    Startup hook that probes each critical dependency in turn.

	    A failed probe never raises: its message is logged at CRITICAL level and
	    collected, and the collected messages are appended to _startup_issues
	    (reported by GET /status). _startup_done is set once all probes have run.
	    """
	    global _startup_done
	    issues: list[str] = []

	    def _fail(msg: str) -> None:
	        # Record one failed probe and log it.
	        issues.append(msg)
	        logger.critical("startup FAIL %s", msg)

	    # 1. cv2 — most common missing dependency
	    try:
	        import cv2  # noqa: F401
	    except ImportError as exc:
	        _fail(
	            f"cv2 not importable: {exc}. "
	            "Add 'opencv-python-headless>=4.8.0' to pip layer 1 in Dockerfile."
	        )
	    else:
	        logger.info("startup ✓ cv2 available (version %s)", cv2.__version__)

	    # 2. magic_pdf core imports
	    try:
	        from magic_pdf.data.dataset import PymuDocDataset, ImageDataset  # noqa: F401
	        from magic_pdf.data.data_reader_writer import (  # noqa: F401
	            FileBasedDataReader, FileBasedDataWriter,
	        )
	        from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze  # noqa: F401
	        from magic_pdf.config.enums import SupportedPdfParseMethod  # noqa: F401
	    except ImportError as exc:
	        _fail(f"magic_pdf not importable: {exc}")
	    else:
	        logger.info("startup ✓ magic_pdf imports OK")

	    # 3. MinerU config
	    config_path = os.path.expanduser("~/magic-pdf.json")
	    if not os.path.exists(config_path):
	        _fail(f"magic-pdf.json missing at {config_path} — run download_models.py")
	    else:
	        logger.info("startup ✓ magic-pdf.json found at %s", config_path)

	    # 4. Model files
	    models_dir = "/app/models/PDF-Extract-Kit-1.0/models"
	    if not os.path.isdir(models_dir):
	        _fail(f"Models directory missing at {models_dir} — run download_models.py")
	    else:
	        logger.info("startup ✓ models directory found at %s", models_dir)

	    # 5. Temp storage writable: create and remove a scratch directory.
	    try:
	        scratch = tempfile.mkdtemp(prefix="mineru_selftest_")
	        shutil.rmtree(scratch)
	        logger.info("startup ✓ temp storage writable")
	    except Exception as exc:
	        _fail(f"Temp storage not writable: {exc}")

	    _startup_issues.extend(issues)
	    _startup_done = True

	    banner = "=" * 60
	    if issues:
	        logger.error(banner)
	        logger.error("Startup self-test FAILED — %d issue(s). See above.", len(issues))
	        logger.error("Service will start but /extract will fail until fixed.")
	        logger.error(banner)
	        return

	    logger.info(banner)
	    logger.info("Startup self-test PASSED — service ready.")
	    logger.info(banner)


	# ─────────────────────────────────────────────────────────────────────────────
	# GET /health
	# ─────────────────────────────────────────────────────────────────────────────
	@app.get("/health")
	def health() -> dict[str, Any]:
	    """
	    Liveness probe: unconditionally reports "healthy" without checking any
	    dependency. Pipeline readiness is reported separately by GET /status.
	    """
	    payload: dict[str, Any] = {"status": "healthy"}
	    return payload


	# ─────────────────────────────────────────────────────────────────────────────
	# GET /status
	# ─────────────────────────────────────────────────────────────────────────────
	@app.get("/status")
	def status() -> dict[str, Any]:
	    """
	    Readiness report for operators.

	    Combines the startup self-test outcome ("healthy" when no issue was
	    recorded, otherwise "degraded"), the installed package version, uptime,
	    container memory figures from _mem_mb(), the in-flight request count,
	    the cache size and the last import-check duration.
	    """
	    mem_used, mem_total = _mem_mb()
	    report: dict[str, Any] = {
	        "status": "degraded" if _startup_issues else "healthy",
	        "provider": "mineru",
	        "version": _mineru_version(),
	        "modelsLoaded": _pipeline_ready,
	        "startupIssues": _startup_issues,
	        "uptimeSeconds": int(time.time() - _START_TIME),
	    }
	    report["memoryUsedMB"] = mem_used
	    report["memoryTotalMB"] = mem_total
	    report["activeRequests"] = _active_requests
	    report["cacheEntries"] = len(_cache)
	    report["lastModelLoadMs"] = _model_load_ms
	    return report


	# ─────────────────────────────────────────────────────────────────────────────
	# POST /extract — single file
	# ─────────────────────────────────────────────────────────────────────────────
	@app.post("/extract")
	async def extract(file: UploadFile = File(...)) -> JSONResponse:
	    """
	    Extract one uploaded PDF or image.

	    Returns the extraction result as JSON on success. An ExtractionError is
	    rendered as its structured body with its own HTTP status; any other
	    exception becomes a 500 with stage "unknown" / INTERNAL_ERROR.

	    The 500 body deliberately carries no traceback: stack traces expose
	    server internals to any caller and are not part of the documented error
	    format. The full traceback is still written to the log by
	    logger.exception().
	    """
	    # NOTE(review): _run_extraction is synchronous and called directly from
	    # this coroutine, so it appears to hold the event loop for the whole
	    # extraction — confirm whether that serialisation is intended.
	    try:
	        _ensure_pipeline()
	        raw, filename, ext = await _read_upload(file)
	        result = _run_extraction(raw, filename, ext)
	        return JSONResponse(content=result)
	    except ExtractionError as exc:
	        logger.warning(
	            "/extract structured error [%s/%s]: %s",
	            exc.stage, exc.code, exc.message,
	        )
	        return JSONResponse(status_code=exc.http_status, content=exc.to_dict())
	    except HTTPException:
	        # Let FastAPI render its own HTTP errors unchanged.
	        raise
	    except Exception as exc:
	        logger.exception("/extract unhandled error")
	        return JSONResponse(
	            status_code=500,
	            content={
	                "success": False,
	                "stage": "unknown",
	                "errorCode": "INTERNAL_ERROR",
	                "message": str(exc),
	            },
	        )


	# ─────────────────────────────────────────────────────────────────────────────
	# POST /batch — up to 8 files; extras silently ignored
	# ─────────────────────────────────────────────────────────────────────────────
	@app.post("/batch")
	async def batch(files: list[UploadFile] = File(...)) -> JSONResponse:
	    """
	    Extract several uploads in one request.

	    Only the first BATCH_MAX_FILES uploads are processed; any further ones
	    are dropped without an error. Files are handled one after another, and a
	    failing file yields a structured error entry (with its sanitized
	    filename) in "results" instead of aborting the batch. A pipeline
	    readiness failure is the only case answered with a non-200 status.
	    """
	    try:
	        _ensure_pipeline()
	    except ExtractionError as exc:
	        return JSONResponse(status_code=exc.http_status, content=exc.to_dict())

	    outcomes: list[dict[str, Any]] = []
	    for upload in files[:BATCH_MAX_FILES]:
	        try:
	            payload, safe_name, extension = await _read_upload(upload)
	            outcome = _run_extraction(payload, safe_name, extension)
	        except ExtractionError as exc:
	            outcome = {
	                **exc.to_dict(),
	                "filename": _sanitize_filename(upload.filename or "upload"),
	            }
	        except Exception as exc:
	            fname = _sanitize_filename(upload.filename or "upload")
	            logger.exception("Batch item failed: %s", fname)
	            outcome = {
	                "success": False,
	                "filename": fname,
	                "stage": "unknown",
	                "errorCode": "INTERNAL_ERROR",
	                "message": str(exc),
	            }
	        outcomes.append(outcome)

	    body = {"success": True, "processed": len(outcomes), "results": outcomes}
	    return JSONResponse(content=body)


	# ─────────────────────────────────────────────────────────────────────────────
	# Upload reader (shared by /extract and /batch)
	# ─────────────────────────────────────────────────────────────────────────────
	async def _read_upload(upload: UploadFile) -> tuple[bytes, str, str]:
	    """
	    Validate one upload and load it into memory.

	    The extension is checked before any bytes are read. At most
	    MAX_UPLOAD_BYTES + 1 bytes are read, which is enough to detect an
	    oversized file without buffering all of it.

	    Returns:
	        (raw_bytes, sanitized_filename, lower-cased extension without dot).

	    Raises:
	        ExtractionError: UNSUPPORTED_TYPE (415), FILE_TOO_LARGE (413) or
	            EMPTY_FILE (400).
	    """
	    filename = _sanitize_filename(upload.filename or "upload")
	    _stem, dot, suffix = filename.rpartition(".")
	    ext = suffix.lower() if dot else ""

	    if ext not in ALLOWED_EXTENSIONS:
	        raise _err(
	            "validation", "UNSUPPORTED_TYPE",
	            f"Unsupported file type '.{ext}'. "
	            f"Supported: {sorted(ALLOWED_EXTENSIONS)}",
	            415,
	        )

	    raw = await upload.read(MAX_UPLOAD_BYTES + 1)
	    size = len(raw)

	    if size > MAX_UPLOAD_BYTES:
	        raise _err(
	            "upload", "FILE_TOO_LARGE",
	            f"'{filename}' exceeds the {MAX_UPLOAD_BYTES // 1024 // 1024} MB limit.",
	            413,
	        )
	    if size == 0:
	        raise _err("upload", "EMPTY_FILE", f"'{filename}' is empty.", 400)

	    return raw, filename, ext


	# ─────────────────────────────────────────────────────────────────────────────
	# Extraction dispatcher (shared by /extract and /batch)
	# ─────────────────────────────────────────────────────────────────────────────
	# Upper bound on cached results; the oldest entry is evicted first so the
	# in-process cache cannot grow without limit in a long-running container.
	_CACHE_MAX_ENTRIES = 256


	def _run_extraction(raw: bytes, filename: str, ext: str) -> dict[str, Any]:
	    """
	    Run one extraction, using the SHA256 content cache where possible.

	    1. SHA256 cache lookup → return immediately on hit (cached: true)
	    2. Memory safety guard → raise ExtractionError(LOW_MEMORY) if OOM likely
	    3. Dispatch to PDF or image processor
	    4. Cache successful result (at most _CACHE_MAX_ENTRIES entries)
	    5. Return with timing metadata

	    Args:
	        raw: File content.
	        filename: Sanitized name of this upload, echoed in the result.
	        ext: Lower-cased extension without the dot; selects the processor.

	    Raises:
	        ExtractionError: From the memory guard or a processor; any other
	            exception is wrapped as stage "unknown" / INTERNAL_ERROR (500).
	    """
	    # ── SHA256 cache ──────────────────────────────────────────────────────────
	    file_hash = hashlib.sha256(raw).hexdigest()
	    with _cache_lock:
	        cached = _cache.get(file_hash)
	    if cached is not None:
	        logger.info("Cache hit %s sha256=%.12s…", filename, file_hash)
	        return {
	            **cached,
	            # The cache is keyed by content, so the stored name belongs to the
	            # upload that populated it; report this upload's name instead.
	            "filename": filename,
	            "metadata": {**cached["metadata"], "cached": True, "processingTimeMs": 0},
	        }

	    # ── Memory safety guard ───────────────────────────────────────────────────
	    _assert_memory_safe(raw, ext)

	    # ── Process ───────────────────────────────────────────────────────────────
	    _inc_active()
	    work_dir = None
	    try:
	        # Created inside the try so a mkdtemp failure still reaches the
	        # finally clause and the active-request counter is decremented.
	        work_dir = tempfile.mkdtemp(prefix="mineru_")
	        t0 = time.perf_counter()

	        if ext in PDF_EXTENSIONS:
	            result = _process_pdf(raw, filename, work_dir)
	        else:
	            result = _process_image(raw, filename, ext, work_dir)

	        elapsed_ms = int((time.perf_counter() - t0) * 1000)
	        result["metadata"] = {
	            **result["metadata"],
	            "processingTimeMs": elapsed_ms,
	            "cached": False,
	        }

	        # Store without timing so cache entries stay lean
	        entry = {
	            **result,
	            "metadata": {
	                k: v for k, v in result["metadata"].items()
	                if k not in ("processingTimeMs", "cached")
	            },
	        }
	        with _cache_lock:
	            # dicts keep insertion order, so the first key is the oldest entry.
	            while len(_cache) >= _CACHE_MAX_ENTRIES:
	                _cache.pop(next(iter(_cache)))
	            _cache[file_hash] = entry

	        return result

	    except ExtractionError:
	        raise
	    except Exception as exc:
	        logger.exception("Extraction failed for %s", filename)
	        raise _err(
	            "unknown", "INTERNAL_ERROR",
	            f"Unexpected error during extraction: {exc}",
	            500,
	        ) from exc
	    finally:
	        _dec_active()
	        if work_dir is not None:
	            shutil.rmtree(work_dir, ignore_errors=True)


	# ─────────────────────────────────────────────────────────────────────────────
	# Memory safety guard
	# ─────────────────────────────────────────────────────────────────────────────
	def _assert_memory_safe(raw: bytes, ext: str) -> None:
	    """
	    Reject the request when the estimated peak memory would leave less than
	    MEM_SAFETY_FLOOR_MB free.

	    PDFs are costed at BYTES_PER_OCR_PAGE per page (at least one page);
	    images use _image_memory_estimate(). When _mem_mb() reports a total of
	    0 the guard is skipped entirely.

	    Raises:
	        ExtractionError: stage "validation", code LOW_MEMORY, HTTP 507.
	    """
	    used_mb, total_mb = _mem_mb()
	    if total_mb == 0:
	        return  # no memory figures available — nothing to compare against

	    mib = 1024 * 1024
	    if ext in PDF_EXTENSIONS:
	        needed_bytes = max(1, _pdf_page_count(raw)) * BYTES_PER_OCR_PAGE
	    else:
	        needed_bytes = _image_memory_estimate(raw, ext)
	    estimated_mb = needed_bytes // mib

	    available_mb = total_mb - used_mb
	    free_after = available_mb - estimated_mb
	    logger.info(
	        "Memory check: avail=%d MB est=%d MB free_after=%d MB",
	        available_mb, estimated_mb, free_after,
	    )

	    if free_after >= MEM_SAFETY_FLOOR_MB:
	        return

	    raise _err(
	        "validation", "LOW_MEMORY",
	        f"Insufficient memory. "
	        f"Available: {available_mb} MB, "
	        f"Estimated needed: {estimated_mb} MB, "
	        f"Safety floor: {MEM_SAFETY_FLOOR_MB} MB. "
	        "Try a smaller file or wait for active requests to complete.",
	        507,
	    )


	def _image_memory_estimate(raw: bytes, ext: str) -> int:
	try:
	if ext in {"heic", "heif"}:
	try:
	from pillow_heif import register_heif_opener
	register_heif_opener()
	except ImportError:
	pass
	img = Image.open(io.BytesIO(raw))
	w, h = img.size
	channels = len(img.getbands()) or 3
	img.close()
	return w * h * channels * IMAGE_MEMORY_FACTOR
	except Exception:
	return len(raw) * 20 # conservative fallback


	# ─────────────────────────────────────────────────────────────────────────────
	# PDF processor
	# ─────────────────────────────────────────────────────────────────────────────
	def _process_pdf(raw: bytes, filename: str, work_dir: str) -> dict[str, Any]:
	    """
	    Run the MinerU pipeline on a PDF and build the result payload.

	    The dataset's own classify() decides between text mode (reported
	    confidence 0.95) and OCR mode (0.85). Extracted images are written
	    below ``work_dir/images``.

	    Raises:
	        ExtractionError: stage "decode", "ocr" or "markdown", depending on
	            which step failed.
	    """
	    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
	    from magic_pdf.data.dataset import PymuDocDataset
	    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
	    from magic_pdf.config.enums import SupportedPdfParseMethod

	    images_dir = os.path.join(work_dir, "images")
	    os.makedirs(images_dir, exist_ok=True)

	    try:
	        image_writer = FileBasedDataWriter(images_dir)
	    except Exception as exc:
	        raise _err("decode", "PDF_WRITER_FAILED", f"Could not create image writer: {exc}") from exc

	    page_count = _pdf_page_count(raw)

	    try:
	        dataset = PymuDocDataset(raw)
	        method = dataset.classify()
	    except Exception as exc:
	        raise _err("decode", "PDF_PARSE_FAILED", f"Could not parse PDF: {exc}") from exc

	    try:
	        text_based = method == SupportedPdfParseMethod.TXT
	        inference = dataset.apply(doc_analyze, ocr=not text_based)
	        if text_based:
	            pipe_result = inference.pipe_txt_mode(image_writer)
	        else:
	            pipe_result = inference.pipe_ocr_mode(image_writer)
	    except Exception as exc:
	        raise _err("ocr", "OCR_PIPELINE_FAILED", f"doc_analyze/pipe failed: {exc}") from exc
	    parse_method, confidence = ("txt", 0.95) if text_based else ("ocr", 0.85)

	    try:
	        markdown = pipe_result.get_markdown(images_dir)
	    except Exception as exc:
	        raise _err("markdown", "MARKDOWN_FAILED", f"get_markdown failed: {exc}") from exc

	    content_list = _safe_content_list(pipe_result, images_dir)
	    doc_type = _classify_document(markdown, filename)

	    metadata = {
	        "parseMethod": parse_method,
	        "backend": "pipeline",
	        "docTypeClassification": doc_type,
	        "imageCount": _count_images(content_list),
	        "tableCount": _count_tables(content_list),
	        "formulaCount": _count_formulas(content_list),
	    }
	    return {
	        "success": True,
	        "filename": filename,
	        "docType": doc_type,
	        "pageCount": page_count,
	        "confidence": confidence,
	        "markdown": markdown,
	        "metadata": metadata,
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# Image processor
	# ─────────────────────────────────────────────────────────────────────────────
	def _process_image(raw: bytes, filename: str, ext: str, work_dir: str) -> dict[str, Any]:
	    """
	    Run the MinerU OCR pipeline on a single image and build the result payload.

	    Formats in PILLOW_IMAGE_EXTENSIONS are first converted to PNG. The image
	    is written to ``work_dir`` and read back through FileBasedDataReader to
	    build an ImageDataset.

	    NOTE: read_local_images() is intentionally NOT used here.
	    In magic-pdf 1.x it expects a single path string; passing a list causes:
	    "stat: path should be string, bytes, os.PathLike or integer, not list"

	    Raises:
	        ExtractionError: stage "decode", "ocr" or "markdown", depending on
	            which step failed.
	    """
	    from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
	    from magic_pdf.data.dataset import ImageDataset
	    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

	    images_dir = os.path.join(work_dir, "images")
	    os.makedirs(images_dir, exist_ok=True)

	    try:
	        image_writer = FileBasedDataWriter(images_dir)
	    except Exception as exc:
	        raise _err("decode", "IMAGE_WRITER_FAILED", f"Could not create image writer: {exc}") from exc

	    # Convert non-native formats to PNG before feeding to MinerU
	    save_ext = ext
	    if ext in PILLOW_IMAGE_EXTENSIONS:
	        try:
	            raw = _convert_image_to_png(raw, ext)
	        except ExtractionError:
	            raise
	        except Exception as exc:
	            raise _err("decode", "IMAGE_DECODE_FAILED", f"Could not decode image: {exc}") from exc
	        save_ext = "png"

	    img_filename = f"input.{save_ext}"
	    img_path = os.path.join(work_dir, img_filename)
	    try:
	        with open(img_path, "wb") as sink:
	            sink.write(raw)
	    except OSError as exc:
	        raise _err("decode", "WRITE_FAILED", f"Could not write temp image: {exc}") from exc

	    try:
	        # FileBasedDataReader(base_dir).read(relative_name) → bytes
	        image_bytes = FileBasedDataReader(work_dir).read(img_filename)
	        dataset = ImageDataset(image_bytes)
	    except Exception as exc:
	        raise _err("decode", "IMAGE_DATASET_FAILED",
	                   f"Could not build ImageDataset: {exc}") from exc

	    try:
	        inference = dataset.apply(doc_analyze, ocr=True)
	        pipe_result = inference.pipe_ocr_mode(image_writer)
	    except Exception as exc:
	        raise _err("ocr", "OCR_PIPELINE_FAILED", f"doc_analyze/pipe failed: {exc}") from exc

	    try:
	        markdown = pipe_result.get_markdown(images_dir)
	    except Exception as exc:
	        raise _err("markdown", "MARKDOWN_FAILED", f"get_markdown failed: {exc}") from exc

	    content_list = _safe_content_list(pipe_result, images_dir)
	    doc_type = _classify_document(markdown, filename)

	    metadata = {
	        "parseMethod": "ocr",
	        "backend": "pipeline",
	        "docTypeClassification": doc_type,
	        "imageCount": _count_images(content_list),
	        "tableCount": _count_tables(content_list),
	        "formulaCount": _count_formulas(content_list),
	    }
	    return {
	        "success": True,
	        "filename": filename,
	        "docType": doc_type,
	        "pageCount": 1,
	        "confidence": 0.85,
	        "markdown": markdown,
	        "metadata": metadata,
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# Utility helpers
	# ─────────────────────────────────────────────────────────────────────────────
	def _sanitize_filename(name: str) -> str:
	name = os.path.basename(name)
	name = re.sub(r"[^\w.\-]", "_", name)
	return name[:200] or "upload"


	def _pdf_page_count(raw: bytes) -> int:
	try:
	doc = fitz.open(stream=raw, filetype="pdf")
	count = doc.page_count
	doc.close()
	return count
	except Exception:
	return 0


	def _convert_image_to_png(raw: bytes, ext: str) -> bytes:
	    """
	    Decode an image with Pillow, convert it to RGB and return it re-encoded
	    as PNG bytes.

	    Raises:
	        ExtractionError: HEIF_NOT_SUPPORTED (415) when a HEIC/HEIF file is
	            given but pillow-heif is not installed; IMAGE_DECODE_FAILED
	            when Pillow cannot open, convert or save the image.
	    """
	    if ext in ("heic", "heif"):
	        # HEIC/HEIF decoding is provided by the optional pillow-heif plugin.
	        try:
	            from pillow_heif import register_heif_opener
	            register_heif_opener()
	        except ImportError:
	            raise _err(
	                "decode", "HEIF_NOT_SUPPORTED",
	                "HEIC/HEIF support requires pillow-heif (not installed).",
	                415,
	            )

	    try:
	        opened = Image.open(io.BytesIO(raw))
	        rgb = opened.convert("RGB")
	        sink = io.BytesIO()
	        rgb.save(sink, format="PNG")
	        return sink.getvalue()
	    except Exception as exc:
	        raise _err("decode", "IMAGE_DECODE_FAILED", f"Pillow could not open image: {exc}") from exc


	def _safe_content_list(pipe_result: Any, images_dir: str) -> list[dict]:
	try:
	return pipe_result.get_content_list(images_dir) or []
	except Exception:
	return []


	def _count_images(content_list: list[dict]) -> int:
	return sum(1 for item in content_list if item.get("type") == "image")


	def _count_tables(content_list: list[dict]) -> int:
	return sum(1 for item in content_list if item.get("type") == "table")


	def _count_formulas(content_list: list[dict]) -> int:
	return sum(
	1 for item in content_list
	if item.get("type") in {"equation", "formula", "interline_equation"}
	)


	def _classify_document(markdown: str, filename: str) -> str:
	"""Keyword-based document type heuristic over extracted Markdown + filename."""
	text = (markdown + " " + filename).lower()

	rules: list[tuple[str, list[str]]] = [
	("invoice", ["invoice", "bill to", "invoice number", "invoice #",
	"due date", "amount due", "subtotal", "tax invoice"]),
	("receipt", ["receipt", "thank you for your purchase", "order total",
	"payment received", "transaction id", "cash receipt"]),
	("marksheet", ["marksheet", "mark sheet", "grade sheet", "scorecard",
	"score card", "cgpa", "sgpa", "semester result",
	"result sheet", "marks obtained"]),
	("resume", ["curriculum vitae", "cv", "resume", "work experience",
	"education", "skills", "references", "objective",
	"professional summary"]),
	("research paper", ["abstract", "introduction", "methodology", "conclusion",
	"references", "keywords", "doi:", "arxiv", "journal",
	"proceedings"]),
	("form", ["please fill", "signature", "date of birth", "applicant",
	"application form", "form no", "checkbox", "tick", "field"]),
	("contract", ["agreement", "hereby", "whereas", "terms and conditions",
	"party of the first", "signed by", "witnesseth",
	"indemnify", "governing law"]),
	("screenshot", ["screenshot", "screen capture", "url:", "http://",
	"https://", "browser", "toolbar", "desktop"]),
	]

	scores: dict[str, int] = {}
	for doc_type, keywords in rules:
	score = sum(1 for kw in keywords if kw in text)
	if score:
	scores[doc_type] = score

	return max(scores, key=lambda k: scores[k]) if scores else "generic document"


	# ─────────────────────────────────────────────────────────────────────────────
	# Memory — cgroup-aware (fixes "105 GB / 123 GB" /proc/meminfo host bleed)
	# ─────────────────────────────────────────────────────────────────────────────
	def _mem_mb() -> tuple[int, int]:
	"""
	Return (used_mb, total_mb) for the CONTAINER, not the host.

	Priority:
	1. cgroups v2 /sys/fs/cgroup/memory.max + memory.current
	2. cgroups v1 /sys/fs/cgroup/memory/memory.limit_in_bytes + usage_in_bytes
	3. /proc/meminfo fallback (may show host memory in Docker — known inaccuracy)

	/proc/meminfo is last resort because HF Docker containers typically do NOT
	have cgroup memory limits mapped into /proc, so it shows the physical host
	RAM (e.g. 123 GB on a 128 GB bare-metal host), misleading the memory guard.
	"""
	# ── cgroups v2 (preferred — modern Docker / HF Spaces) ───────────────────
	try:
	with open("/sys/fs/cgroup/memory.max") as f:
	raw_max = f.read().strip()
	if raw_max != "max":
	limit_bytes = int(raw_max)
	with open("/sys/fs/cgroup/memory.current") as f:
	used_bytes = int(f.read().strip())
	if limit_bytes > 0:
	return used_bytes // (1024 * 1024), limit_bytes // (1024 * 1024)
	except (FileNotFoundError, ValueError, OSError):
	pass

	# ── cgroups v1 ────────────────────────────────────────────────────────────
	try:
	with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f:
	limit_bytes = int(f.read().strip())
	with open("/sys/fs/cgroup/memory/memory.usage_in_bytes") as f:
	used_bytes = int(f.read().strip())
	# Unconstrained cgroup reports a sentinel > 1 PB; skip it
	if limit_bytes < 128 * 1024 * 1024 * 1024:
	return used_bytes // (1024 * 1024), limit_bytes // (1024 * 1024)
	except (FileNotFoundError, ValueError, OSError):
	pass

	# ── /proc/meminfo fallback ────────────────────────────────────────────────
	try:
	info: dict[str, int] = {}
	with open("/proc/meminfo") as f:
	for line in f:
	parts = line.split()
	if len(parts) >= 2:
	info[parts[0].rstrip(":")] = int(parts[1]) # values are in kB
	total_kb = info.get("MemTotal", 0)
	avail_kb = info.get("MemAvailable", 0)
	used_kb = total_kb - avail_kb
	return used_kb // 1024, total_kb // 1024
	except Exception:
	return 0, 0


	def _mineru_version() -> str:
	for pkg in ("magic-pdf", "mineru"):
	try:
	return pkg_version(pkg)
	except Exception:
	continue
	return "unknown"