# Unified backend OCR processor using Google Gemini
import os
import base64
import logging
import time
from typing import List, Dict, Optional

# The Gemini SDK is only needed once an OCR request is actually issued.
# Tolerating its absence here keeps the path/encoding helpers importable;
# the original import error is re-raised from run_gemini_ocr() when needed.
try:
    import google.generativeai as genai
    _GENAI_IMPORT_ERROR: Optional[ImportError] = None
except ImportError as exc:
    genai = None
    _GENAI_IMPORT_ERROR = exc

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# File extensions (lower-case, without the dot) accepted for OCR.
SUPPORTED_IMAGE_TYPES = {"jpg", "jpeg", "png", "bmp", "webp", "tiff"}


def encode_image_to_base64(image_path: str) -> Optional[str]:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents decoded as UTF-8 text, or ``None``
        if the file does not exist or could not be read (the failure is
        logged).
    """
    if not os.path.exists(image_path):
        logger.error("Image not found: %s", image_path)
        return None
    try:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode("utf-8")
    except Exception as e:
        # Deliberately best-effort: any read/encode failure is reported to
        # the caller as None rather than propagated.
        logger.error("Failed to read or encode image %s: %s", image_path, e)
        return None


def get_mime_type(image_path: str) -> Optional[str]:
    """Derive the image MIME type from the file name's extension.

    Only the final path component is inspected, and only when it contains
    a dot, so a file without an extension (or a dot in a directory name)
    is never mistaken for a supported format.

    Args:
        image_path: Path whose extension is examined (case-insensitive).

    Returns:
        ``"image/<type>"`` for extensions in ``SUPPORTED_IMAGE_TYPES``
        (``jpg`` is mapped to ``jpeg``), otherwise ``None`` after logging
        a warning.
    """
    _, dot, suffix = os.path.basename(image_path).rpartition(".")
    ext = suffix.lower() if dot else ""
    if ext in SUPPORTED_IMAGE_TYPES:
        return f"image/{'jpeg' if ext == 'jpg' else ext}"
    logger.warning("Unsupported image format: %s", ext or "<no extension>")
    return None


def run_gemini_ocr(
    image_path: str,
    api_key: str,
    max_retries: int = 3,
    retry_delay: float = 1.0,
) -> str:
    """Extract Odia text from one image using the Gemini model.

    The image is validated before the SDK is configured, so an unusable
    file never triggers any API setup.

    Args:
        image_path: Path of the image to process.
        api_key: API key passed to ``genai.configure``.
        max_retries: Maximum number of request attempts. Values below 1
            are treated as 1 so the function always returns a string.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The stripped response text, or a bracketed placeholder string when
        the image cannot be processed, the response has no text, or every
        attempt failed.

    Raises:
        ImportError: If the ``google.generativeai`` package could not be
            imported and a request would have to be made.
    """
    # Check the extension first so unsupported files are not read at all.
    mime_type = get_mime_type(image_path)
    base64_image = (
        encode_image_to_base64(image_path) if mime_type is not None else None
    )
    if base64_image is None or mime_type is None:
        return "[Image could not be processed]"

    if genai is None:
        raise ImportError(
            "google.generativeai is required to run OCR"
        ) from _GENAI_IMPORT_ERROR

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = (
        "Extract all visible Odia (ଓଡ଼ିଆ) text from the image accurately.\n"
        "Only output the Odia text content. Do not explain or translate anything.\n"
        "If no Odia text is found, return '[No Odia text found]'."
    )
    image_part = {"mime_type": mime_type, "data": base64_image}
    generation_config = {
        "temperature": 0.2,
        "max_output_tokens": 2048,
        "top_p": 0.8,
        "top_k": 40,
    }

    attempts = max(1, max_retries)
    last_error: Optional[Exception] = None
    for attempt in range(1, attempts + 1):
        try:
            response = model.generate_content(
                [prompt, image_part],
                generation_config=generation_config,
            )
            # Accessing .text stays inside the try so a failure there is
            # retried like any other request error.
            text = (response.text or "").strip()
        except Exception as e:
            last_error = e
            logger.error(
                "OCR attempt %d failed for %s: %s", attempt, image_path, e
            )
            # No point sleeping after the final attempt.
            if attempt < attempts:
                time.sleep(retry_delay)
            continue
        logger.info("OCR complete for %s", os.path.basename(image_path))
        return text or "[No text extracted]"

    return f"[OCR failed after {attempts} attempts: {last_error}]"


def batch_run_ocr(
    image_filenames: List[str], image_folder: str, api_key: str
) -> Dict[str, str]:
    """Run OCR over several images located in one folder.

    Args:
        image_filenames: File names relative to ``image_folder``.
        image_folder: Directory containing the images.
        api_key: API key forwarded to ``run_gemini_ocr``.

    Returns:
        A dict mapping each file name to its OCR result, or to
        ``"[Image file not found]"`` when the file does not exist.
    """
    results: Dict[str, str] = {}
    logger.info("Starting batch OCR on %d images.", len(image_filenames))
    for filename in image_filenames:
        image_path = os.path.join(image_folder, filename)
        if not os.path.exists(image_path):
            logger.error("Image not found: %s", image_path)
            results[filename] = "[Image file not found]"
            continue
        results[filename] = run_gemini_ocr(image_path, api_key)
    logger.info("Batch OCR complete.")
    return results