"""Gemini API extraction function for table page enhancement.""" import base64 import re import time from typing import Optional import httpx from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE) _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE) def _gemini_extract_page( page_image_bytes: bytes, request_id: str = "", page_no: int = 0 ) -> Optional[str]: """Send a page image to Gemini for high-quality extraction.""" if not GEMINI_API_KEY: logger.warning(f"[{request_id}] GEMINI_API_KEY not set; skipping Gemini extraction") return None b64_image = base64.b64encode(page_image_bytes).decode("utf-8") payload = { "contents": [ { "parts": [ {"inline_data": {"mime_type": "image/png", "data": b64_image}}, { "text": ( "Convert this document page to clean markdown format.\n\n" "Rules:\n" "- Extract ALL text content exactly as written; do not paraphrase\n" "- Use ## for main headings and ### for subsection headings\n" "- Preserve lists, paragraphs, bullet points, and structure\n" "- For tables, format them as HTML using
| , | \n"
"- Include ALL columns and preserve numbers, dates, and lease terms exactly\n"
"- Use for line breaks within table cells\n" "- Do NOT wrap output in code fences\n" "- Do NOT include image descriptions, branding, headers, or footers\n" "- Output ONLY the extracted content" ) }, ] } ], "generationConfig": {"temperature": 0.1, "maxOutputTokens": 32768}, } url = ( f"https://generativelanguage.googleapis.com/v1beta/models/" f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}" ) for attempt in range(1, 3): try: timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0) response = httpx.post(url, json=payload, timeout=timeout) if response.status_code == 429: logger.warning( f"[{request_id}] Gemini rate limited on page {page_no + 1}, attempt {attempt}" ) time.sleep(5) continue if response.status_code != 200: try: err = response.json() msg = str(err.get("error", {}).get("message", str(err)[:300])) except Exception: msg = response.text[:300] logger.error( f"[{request_id}] Gemini error ({response.status_code}) page {page_no + 1}: {msg}" ) if attempt == 1: continue return None result = response.json() candidates = result.get("candidates", []) if not candidates: logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}") return None parts = candidates[0].get("content", {}).get("parts", []) if not parts: return None content = parts[0].get("text", "") content = _CODE_FENCE_PATTERN.sub("", content) content = _CODE_FENCE_END.sub("", content) return content.strip() or None except (httpx.TimeoutException, httpx.ConnectError) as e: if attempt == 1: logger.warning( f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}" ) continue logger.error(f"[{request_id}] Gemini failed after retries on page {page_no + 1}: {e}") return None return None |
|---|