Spaces:
Sleeping
Sleeping
| # Unified backend OCR processor using Google Gemini | |
| import os | |
| import base64 | |
| import logging | |
| import time | |
| from typing import List, Dict, Optional | |
| import google.generativeai as genai | |
# File extensions (lower-case, without the leading dot) accepted as OCR input.
SUPPORTED_IMAGE_TYPES = {
    "jpg",
    "jpeg",
    "png",
    "bmp",
    "webp",
    "tiff",
}

# Root logging setup: INFO and above, emitted as "[timestamp] LEVEL - message".
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def encode_image_to_base64(image_path: str) -> Optional[str]:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``, or
        ``None`` when the path does not exist or the file cannot be read or
        encoded. Both failure cases are logged at ERROR level.
    """
    if os.path.exists(image_path):
        try:
            with open(image_path, "rb") as handle:
                raw_bytes = handle.read()
                encoded = base64.b64encode(raw_bytes)
                return encoded.decode("utf-8")
        except Exception as exc:
            # Best-effort helper: any failure is reported and mapped to None.
            logger.error(f"Failed to read or encode image {image_path}: {exc}")
            return None

    logger.error(f"Image not found: {image_path}")
    return None
def get_mime_type(image_path: str) -> Optional[str]:
    """Derive an image MIME type from the file extension of a path.

    Args:
        image_path: Path (or bare filename) of the image.

    Returns:
        ``"image/<ext>"`` when the lower-cased extension is listed in
        ``SUPPORTED_IMAGE_TYPES`` (``jpg`` is reported as ``image/jpeg``),
        otherwise ``None`` after logging a warning.
    """
    # splitext only inspects the final path component and ignores a leading
    # dot, so an extension-less file that happens to be named "png" (or a
    # dotfile called ".png") is no longer mistaken for a PNG image, which is
    # what a plain split(".")[-1] would do.
    _, dotted_ext = os.path.splitext(image_path)
    ext = dotted_ext[1:].lower()
    if ext in SUPPORTED_IMAGE_TYPES:
        return f"image/{'jpeg' if ext == 'jpg' else ext}"
    logger.warning(f"Unsupported image format: {ext}")
    return None
def run_gemini_ocr(image_path: str, api_key: str, max_retries: int = 3) -> str:
    """Extract Odia text from one image using the Gemini model.

    The image is base64-encoded, paired with its MIME type and sent together
    with a fixed extraction prompt to ``gemini-1.5-flash``. Failed calls are
    retried with a one-second pause between attempts.

    Args:
        image_path: Path of the image to OCR.
        api_key: API key passed to ``genai.configure``.
        max_retries: Total number of attempts. Values below 1 are treated as
            1 so the function always makes an attempt and always returns a
            string.

    Returns:
        The stripped response text on success. Otherwise one of the
        bracketed placeholder strings: ``"[Image could not be processed]"``
        when the file cannot be read or has an unsupported extension,
        ``"[No text extracted]"`` when the response text is empty or only
        whitespace, or ``"[OCR failed after N attempts: <error>]"`` when
        every attempt raised.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    base64_image = encode_image_to_base64(image_path)
    mime_type = get_mime_type(image_path)
    if base64_image is None or mime_type is None:
        return "[Image could not be processed]"

    prompt = (
        "Extract all visible Odia (ଓଡ଼ିଆ) text from the image accurately.\n"
        "Only output the Odia text content. Do not explain or translate anything.\n"
        "If no Odia text is found, return '[No Odia text found]'."
    )
    contents = [prompt, {"mime_type": mime_type, "data": base64_image}]
    generation_config = {
        "temperature": 0.2,
        "max_output_tokens": 2048,
        "top_p": 0.8,
        "top_k": 40,
    }

    # Guard against max_retries <= 0: without the clamp the loop body would
    # never run and the function would fall through and return None.
    attempts = max(1, max_retries)
    last_error: Optional[Exception] = None

    for attempt in range(1, attempts + 1):
        try:
            response = model.generate_content(
                contents,
                generation_config=generation_config,
            )
            # Strip before testing for emptiness so a whitespace-only reply
            # yields the placeholder instead of an empty string.
            text = (response.text or "").strip()
        except Exception as e:
            # Any failure of the call or of reading response.text is treated
            # as retryable; remember it for the final message.
            last_error = e
            logger.error(f"OCR attempt {attempt} failed for {image_path}: {e}")
            if attempt < attempts:
                time.sleep(1)
        else:
            logger.info(f"OCR complete for {os.path.basename(image_path)}")
            return text or "[No text extracted]"

    return f"[OCR failed after {attempts} attempts: {last_error}]"
def batch_run_ocr(image_filenames: List[str], image_folder: str, api_key: str) -> Dict[str, str]:
    """Run OCR over a list of image files located in one folder.

    Args:
        image_filenames: File names, each joined onto ``image_folder``.
        image_folder: Directory that contains the images.
        api_key: API key forwarded to ``run_gemini_ocr`` for every image.

    Returns:
        A dict mapping each given filename to the string returned by
        ``run_gemini_ocr``, or to ``"[Image file not found]"`` when the
        joined path does not exist. A filename listed more than once keeps
        the result of its last occurrence.
    """
    logger.info(f"Starting batch OCR on {len(image_filenames)} images.")

    ocr_by_name: Dict[str, str] = {}
    for name in image_filenames:
        full_path = os.path.join(image_folder, name)
        if os.path.exists(full_path):
            ocr_by_name[name] = run_gemini_ocr(full_path, api_key)
        else:
            # Missing files are recorded with a placeholder, not skipped.
            logger.error(f"Image not found: {full_path}")
            ocr_by_name[name] = "[Image file not found]"

    logger.info("Batch OCR complete.")
    return ocr_by_name