Spaces:

outcomelabs
/

docling-parser

Running on T4

Ibad ur Rehman

feat: deploy docling first parser

74cacc0 29 days ago

4.17 kB

	"""Gemini API extraction function for table page enhancement."""

	import base64
	import re
	import time
	from typing import Optional

	import httpx

	from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger

	_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown\|md\|text)?\s*\n?", re.MULTILINE)
	_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)


	def _build_gemini_payload(page_image_bytes: bytes) -> dict:
	    """Build the generateContent request body for a single page image."""
	    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
	    return {
	        "contents": [
	            {
	                "parts": [
	                    # The image is declared as PNG; callers are expected to
	                    # pass PNG-encoded bytes.
	                    {"inline_data": {"mime_type": "image/png", "data": b64_image}},
	                    {
	                        "text": (
	                            "Convert this document page to clean markdown format.\n\n"
	                            "Rules:\n"
	                            "- Extract ALL text content exactly as written; do not paraphrase\n"
	                            "- Use ## for main headings and ### for subsection headings\n"
	                            "- Preserve lists, paragraphs, bullet points, and structure\n"
	                            "- For tables, format them as HTML using <table>, <thead>, <tbody>, <tr>, <th>, <td>\n"
	                            "- Include ALL columns and preserve numbers, dates, and lease terms exactly\n"
	                            "- Use <br> for line breaks within table cells\n"
	                            "- Do NOT wrap output in code fences\n"
	                            "- Do NOT include image descriptions, branding, headers, or footers\n"
	                            "- Output ONLY the extracted content"
	                        )
	                    },
	                ]
	            }
	        ],
	        "generationConfig": {"temperature": 0.1, "maxOutputTokens": 32768},
	    }


	def _gemini_extract_page(
	    page_image_bytes: bytes, request_id: str = "", page_no: int = 0
	) -> Optional[str]:
	    """Send a page image to Gemini and return the extracted markdown.

	    The request is attempted at most twice. The second attempt uses a
	    timeout 1.5x longer than ``GEMINI_TIMEOUT``. A 429 response is followed
	    by a 5 second pause before the retry.

	    Args:
	        page_image_bytes: Raw image bytes of the page (sent as ``image/png``).
	        request_id: Identifier used as a prefix in log messages.
	        page_no: Page index; logged as ``page_no + 1``.

	    Returns:
	        The model's text with code fences removed and surrounding whitespace
	        stripped, or ``None`` when the API key is missing, every attempt
	        fails, or the response contains no usable text.
	    """
	    if not GEMINI_API_KEY:
	        logger.warning(f"[{request_id}] GEMINI_API_KEY not set; skipping Gemini extraction")
	        return None

	    payload = _build_gemini_payload(page_image_bytes)
	    url = (
	        "https://generativelanguage.googleapis.com/v1beta/models/"
	        f"{GEMINI_MODEL}:generateContent"
	    )
	    # Send the key as a header rather than a "?key=" query parameter so it
	    # does not end up in request URLs recorded by HTTP client logs or proxies.
	    headers = {"x-goog-api-key": GEMINI_API_KEY}

	    max_attempts = 2
	    rate_limit_pause_s = 5

	    for attempt in range(1, max_attempts + 1):
	        is_last_attempt = attempt == max_attempts
	        timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)

	        # Only the network call can raise the transport errors handled here.
	        try:
	            response = httpx.post(url, json=payload, headers=headers, timeout=timeout)
	        except (httpx.TimeoutException, httpx.ConnectError) as e:
	            if is_last_attempt:
	                logger.error(
	                    f"[{request_id}] Gemini failed after retries on page {page_no + 1}: {e}"
	                )
	                return None
	            logger.warning(
	                f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}"
	            )
	            continue

	        if response.status_code == 429:
	            logger.warning(
	                f"[{request_id}] Gemini rate limited on page {page_no + 1}, attempt {attempt}"
	            )
	            if is_last_attempt:
	                # No retry follows, so pausing would only delay the caller.
	                return None
	            time.sleep(rate_limit_pause_s)
	            continue

	        if response.status_code != 200:
	            try:
	                err = response.json()
	                msg = str(err.get("error", {}).get("message", str(err)[:300]))
	            except (ValueError, AttributeError):
	                # Body is not JSON, or not shaped like an error object.
	                msg = response.text[:300]
	            logger.error(
	                f"[{request_id}] Gemini error ({response.status_code}) page {page_no + 1}: {msg}"
	            )
	            if is_last_attempt:
	                return None
	            continue

	        try:
	            result = response.json()
	        except ValueError:
	            # A 200 with an undecodable body is treated like any other
	            # failed attempt instead of raising out of this function.
	            logger.error(
	                f"[{request_id}] Gemini returned invalid JSON for page {page_no + 1}"
	            )
	            if is_last_attempt:
	                return None
	            continue

	        candidates = result.get("candidates") if isinstance(result, dict) else None
	        if not candidates:
	            logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
	            return None

	        # "or" guards cover keys that are present but null.
	        parts = (candidates[0].get("content") or {}).get("parts") or []
	        if not parts:
	            return None

	        content = parts[0].get("text") or ""
	        content = _CODE_FENCE_PATTERN.sub("", content)
	        content = _CODE_FENCE_END.sub("", content)
	        return content.strip() or None

	    return None