docling-parser / gemini.py
Ibad ur Rehman
feat: deploy docling first parser
74cacc0
"""Gemini API extraction function for table page enhancement."""
import base64
import re
import time
from typing import Optional
import httpx
from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger
# Patterns that remove a code fence the model wrapped around its WHOLE reply.
#
# They are anchored to the very start (\A) and very end (\Z) of the text
# instead of using re.MULTILINE: with MULTILINE, `^`/`$` match on every line,
# so fences that belong to the page content were stripped too, and an opening
# fence with an unlisted tag (e.g. "```python") lost only its backticks,
# leaving a stray "python" line behind.
#
# "html" is accepted as a wrapper tag alongside markdown/md/text.
_CODE_FENCE_PATTERN = re.compile(r"\A\s*```(?:markdown|md|text|html)?\s*\n?")
_CODE_FENCE_END = re.compile(r"\n?```\s*\Z")
def _gemini_extract_page(
    page_image_bytes: bytes, request_id: str = "", page_no: int = 0
) -> Optional[str]:
    """Send a page image to Gemini and return its markdown transcription.

    The request is attempted at most twice. The second attempt uses a timeout
    1.5x longer than ``GEMINI_TIMEOUT``. Rate limiting (HTTP 429), other
    non-200 responses and transport failures each trigger the single retry.
    This function is best-effort: every failure path is logged and results in
    ``None`` rather than an exception.

    Args:
        page_image_bytes: Raw image bytes of the page. They are sent with a
            hard-coded ``image/png`` mime type, so PNG data is assumed.
        request_id: Identifier used only as a prefix in log messages.
        page_no: Zero-based page index; logged as ``page_no + 1``.

    Returns:
        The extracted text with code fences removed by the module-level
        fence patterns and surrounding whitespace stripped, or ``None`` when
        the API key is missing, the request fails, or the reply has no text.
    """
    if not GEMINI_API_KEY:
        logger.warning(f"[{request_id}] GEMINI_API_KEY not set; skipping Gemini extraction")
        return None

    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
    payload = {
        "contents": [
            {
                "parts": [
                    {"inline_data": {"mime_type": "image/png", "data": b64_image}},
                    {
                        "text": (
                            "Convert this document page to clean markdown format.\n\n"
                            "Rules:\n"
                            "- Extract ALL text content exactly as written; do not paraphrase\n"
                            "- Use ## for main headings and ### for subsection headings\n"
                            "- Preserve lists, paragraphs, bullet points, and structure\n"
                            "- For tables, format them as HTML using <table>, <thead>, <tbody>, <tr>, <th>, <td>\n"
                            "- Include ALL columns and preserve numbers, dates, and lease terms exactly\n"
                            "- Use <br> for line breaks within table cells\n"
                            "- Do NOT wrap output in code fences\n"
                            "- Do NOT include image descriptions, branding, headers, or footers\n"
                            "- Output ONLY the extracted content"
                        )
                    },
                ]
            }
        ],
        "generationConfig": {"temperature": 0.1, "maxOutputTokens": 32768},
    }

    # The key travels in a header rather than the query string so it cannot
    # leak through URL-level request logging or error messages.
    url = (
        "https://generativelanguage.googleapis.com/v1beta/models/"
        f"{GEMINI_MODEL}:generateContent"
    )
    headers = {"x-goog-api-key": GEMINI_API_KEY}

    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        is_last_attempt = attempt == max_attempts
        # Give the retry more headroom than the first try.
        timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)

        # Keep the try body to the network call only, so parsing errors below
        # are not mistaken for transport failures.
        try:
            response = httpx.post(url, json=payload, headers=headers, timeout=timeout)
        except httpx.TransportError as e:
            # TransportError covers timeouts and connection errors as well as
            # read/write/protocol failures, which previously escaped.
            if not is_last_attempt:
                logger.warning(
                    f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}"
                )
                continue
            logger.error(f"[{request_id}] Gemini failed after retries on page {page_no + 1}: {e}")
            return None

        if response.status_code == 429:
            logger.warning(
                f"[{request_id}] Gemini rate limited on page {page_no + 1}, attempt {attempt}"
            )
            # Only back off when another attempt will actually follow.
            if not is_last_attempt:
                time.sleep(5)
            continue

        if response.status_code != 200:
            try:
                err = response.json()
                msg = str(err.get("error", {}).get("message", str(err)[:300]))
            except Exception:
                # Error body was not the expected JSON shape; log raw text.
                msg = response.text[:300]
            logger.error(
                f"[{request_id}] Gemini error ({response.status_code}) page {page_no + 1}: {msg}"
            )
            if not is_last_attempt:
                continue
            return None

        try:
            result = response.json()
        except ValueError as e:
            logger.error(
                f"[{request_id}] Gemini returned invalid JSON for page {page_no + 1}: {e}"
            )
            return None

        candidates = result.get("candidates") if isinstance(result, dict) else None
        if not candidates:
            logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
            return None

        # `content` / `parts` may be absent or null; treat both as "no parts".
        parts = (candidates[0].get("content") or {}).get("parts") or []
        if not parts:
            return None

        # A reply can be split across several parts; join all text parts
        # (skipping any flagged as model "thought") instead of reading only
        # the first one, which could truncate the page.
        content = "".join(
            part.get("text") or ""
            for part in parts
            if isinstance(part, dict) and not part.get("thought")
        )
        content = _CODE_FENCE_PATTERN.sub("", content)
        content = _CODE_FENCE_END.sub("", content)
        return content.strip() or None

    # Reached only when the final attempt was still rate limited.
    logger.error(
        f"[{request_id}] Gemini still rate limited after {max_attempts} attempts "
        f"on page {page_no + 1}"
    )
    return None