Spaces:
Running on T4
Running on T4
| """Gemini API extraction function for table page enhancement.""" | |
| import base64 | |
| import re | |
| import time | |
| from typing import Optional | |
| import httpx | |
| from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger | |
| _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE) | |
| _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE) | |
def _gemini_extract_page(
    page_image_bytes: bytes, request_id: str = "", page_no: int = 0
) -> Optional[str]:
    """Send a page image to Gemini for high-quality extraction.

    The image is base64-encoded and posted to the ``generateContent`` endpoint
    of ``GEMINI_MODEL`` together with a prompt asking for markdown output with
    HTML tables. The request is tried at most twice; the second try uses a
    1.5x longer timeout. This is a best-effort call: every failure is logged
    and reported as ``None`` rather than raised.

    Args:
        page_image_bytes: Raw image bytes of the page. The payload declares
            them as ``image/png``.
        request_id: Identifier used as the ``[request_id]`` log prefix.
        page_no: Zero-based page index; logged as ``page_no + 1``.

    Returns:
        The extracted text with any wrapping code fence and surrounding
        whitespace removed, or ``None`` when the API key is missing, the image
        is empty, the request fails, or the response contains no text.
    """
    if not GEMINI_API_KEY:
        logger.warning(f"[{request_id}] GEMINI_API_KEY not set; skipping Gemini extraction")
        return None
    if not page_image_bytes:
        # An empty image can only produce an API error; skip the network calls.
        logger.warning(
            f"[{request_id}] Empty image for page {page_no + 1}; skipping Gemini extraction"
        )
        return None

    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
    payload = {
        "contents": [
            {
                "parts": [
                    {"inline_data": {"mime_type": "image/png", "data": b64_image}},
                    {
                        "text": (
                            "Convert this document page to clean markdown format.\n\n"
                            "Rules:\n"
                            "- Extract ALL text content exactly as written; do not paraphrase\n"
                            "- Use ## for main headings and ### for subsection headings\n"
                            "- Preserve lists, paragraphs, bullet points, and structure\n"
                            "- For tables, format them as HTML using <table>, <thead>, <tbody>, <tr>, <th>, <td>\n"
                            "- Include ALL columns and preserve numbers, dates, and lease terms exactly\n"
                            "- Use <br> for line breaks within table cells\n"
                            "- Do NOT wrap output in code fences\n"
                            "- Do NOT include image descriptions, branding, headers, or footers\n"
                            "- Output ONLY the extracted content"
                        )
                    },
                ]
            }
        ],
        "generationConfig": {"temperature": 0.1, "maxOutputTokens": 32768},
    }
    url = (
        "https://generativelanguage.googleapis.com/v1beta/models/"
        f"{GEMINI_MODEL}:generateContent"
    )
    # Send the key as a header instead of a "?key=" query parameter so it does
    # not end up in request-URL logging.
    headers = {"x-goog-api-key": GEMINI_API_KEY}

    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        is_last_attempt = attempt == max_attempts

        try:
            timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
            response = httpx.post(url, json=payload, headers=headers, timeout=timeout)
        except httpx.TransportError as e:
            # TransportError covers timeouts and connect errors as well as
            # read/write/protocol failures, all of which are treated the same.
            if not is_last_attempt:
                logger.warning(
                    f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}"
                )
                continue
            logger.error(f"[{request_id}] Gemini failed after retries on page {page_no + 1}: {e}")
            return None

        if response.status_code == 429:
            logger.warning(
                f"[{request_id}] Gemini rate limited on page {page_no + 1}, attempt {attempt}"
            )
            if is_last_attempt:
                # No retry follows, so sleeping here would only delay the caller.
                logger.error(
                    f"[{request_id}] Gemini still rate limited after retries on page {page_no + 1}"
                )
                return None
            time.sleep(5)
            continue

        if response.status_code != 200:
            try:
                err = response.json()
                msg = str(err.get("error", {}).get("message", str(err)[:300]))
            except (ValueError, AttributeError, TypeError):
                # Body is not JSON or not shaped like an error object.
                msg = response.text[:300]
            logger.error(
                f"[{request_id}] Gemini error ({response.status_code}) page {page_no + 1}: {msg}"
            )
            if not is_last_attempt:
                continue
            return None

        try:
            result = response.json()
        except ValueError as e:
            logger.error(
                f"[{request_id}] Gemini returned invalid JSON for page {page_no + 1}: {e}"
            )
            return None

        candidates = result.get("candidates", []) if isinstance(result, dict) else []
        if not candidates:
            logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
            return None

        parts = (candidates[0].get("content") or {}).get("parts") or []
        if not parts:
            return None

        # The answer may be split across several parts; join every text part
        # instead of keeping only the first. Parts flagged as "thought" are
        # skipped (presumably model reasoning rather than page content).
        content = "".join(
            part.get("text") or ""
            for part in parts
            if isinstance(part, dict) and not part.get("thought")
        )
        content = _CODE_FENCE_PATTERN.sub("", content)
        content = _CODE_FENCE_END.sub("", content)
        return content.strip() or None

    return None