# Sk4467's picture
# Upload 108 files
# 1e83c8a verified
# Unified backend OCR processor using Google Gemini
import os
import base64
import logging
import time
from typing import List, Dict, Optional
import google.generativeai as genai
# Configure the root logger once at import time: timestamped, INFO and above.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Lower-case file extensions (without the dot) accepted for OCR.
SUPPORTED_IMAGE_TYPES = {
    "bmp",
    "jpeg",
    "jpg",
    "png",
    "tiff",
    "webp",
}
def encode_image_to_base64(image_path: str) -> Optional[str]:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``, or
        ``None`` when the file is missing or cannot be read. Failures are
        logged rather than raised.
    """
    # Open the file directly instead of probing with os.path.exists() first:
    # the file could vanish between the probe and the open, and the probe
    # costs an extra stat call on every successful read.
    try:
        with open(image_path, "rb") as img_file:
            return base64.b64encode(img_file.read()).decode("utf-8")
    except FileNotFoundError:
        logger.error(f"Image not found: {image_path}")
        return None
    except Exception as e:
        # Deliberately broad: this helper is best-effort and callers rely on
        # a None result (not an exception) for any unreadable image.
        logger.error(f"Failed to read or encode image {image_path}: {e}")
        return None
def get_mime_type(image_path: str) -> Optional[str]:
    """Derive the image MIME type from a path's file extension.

    Args:
        image_path: Path or filename of the image.

    Returns:
        ``"image/<ext>"`` when the lower-cased extension is listed in
        ``SUPPORTED_IMAGE_TYPES`` (``jpg`` is reported as ``image/jpeg``),
        otherwise ``None`` after logging a warning.
    """
    # os.path.splitext inspects only the final path component, so a dot in a
    # directory name ("scans.d/page") or an extension-less file that happens
    # to be called "png" is no longer mistaken for an image extension.
    # Note: a dot-file such as ".png" is treated as having no extension.
    ext = os.path.splitext(image_path)[1][1:].lower()
    if ext in SUPPORTED_IMAGE_TYPES:
        return f"image/{'jpeg' if ext == 'jpg' else ext}"
    logger.warning(f"Unsupported image format: {ext}")
    return None
def run_gemini_ocr(image_path: str, api_key: str, max_retries: int = 3) -> str:
    """Extract Odia text from one image using the Gemini model.

    Args:
        image_path: Path of the image to send for OCR.
        api_key: API key passed to ``genai.configure``.
        max_retries: Maximum number of ``generate_content`` attempts. Values
            below 1 are treated as 1 so the function always returns a string.

    Returns:
        The stripped text of the model response, or one of the bracketed
        placeholder strings: ``"[Image could not be processed]"`` when the
        image is unreadable or has an unsupported extension,
        ``"[No text extracted]"`` when the response text is empty, or
        ``"[OCR failed after N attempts: ...]"`` when every attempt raised.
    """
    # Validate the input before touching the API client, and check the cheap
    # extension test first so unsupported files are never read and encoded.
    mime_type = get_mime_type(image_path)
    if mime_type is None:
        return "[Image could not be processed]"
    base64_image = encode_image_to_base64(image_path)
    if base64_image is None:
        return "[Image could not be processed]"

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")
    prompt = (
        "Extract all visible Odia (ଓଡ଼ିଆ) text from the image accurately.\n"
        "Only output the Odia text content. Do not explain or translate anything.\n"
        "If no Odia text is found, return '[No Odia text found]'."
    )

    # Guarantee at least one attempt; otherwise a non-positive max_retries
    # would skip the loop entirely and fall through returning None.
    attempts = max(1, max_retries)
    last_error: Optional[Exception] = None
    for attempt in range(attempts):
        try:
            response = model.generate_content(
                [
                    prompt,
                    {
                        "mime_type": mime_type,
                        "data": base64_image,
                    },
                ],
                generation_config={
                    "temperature": 0.2,
                    "max_output_tokens": 2048,
                    "top_p": 0.8,
                    "top_k": 40,
                },
            )
            text = response.text.strip() if response.text else "[No text extracted]"
        except Exception as e:
            # Broad on purpose: any API or response-parsing failure is
            # retried and finally reported as a placeholder string.
            logger.error(f"OCR attempt {attempt + 1} failed for {image_path}: {e}")
            last_error = e
            # Pause between attempts, but not after the final one.
            if attempt < attempts - 1:
                time.sleep(1)
        else:
            logger.info(f"OCR complete for {os.path.basename(image_path)}")
            return text

    return f"[OCR failed after {attempts} attempts: {str(last_error)}]"
def batch_run_ocr(image_filenames: List[str], image_folder: str, api_key: str) -> Dict[str, str]:
    """Run Gemini OCR over several images stored in one folder.

    Args:
        image_filenames: Filenames to process, relative to ``image_folder``.
        image_folder: Directory that contains the images.
        api_key: API key forwarded to ``run_gemini_ocr``.

    Returns:
        A dict mapping each filename to its OCR result, or to
        ``"[Image file not found]"`` when the joined path does not exist.
    """
    logger.info(f"Starting batch OCR on {len(image_filenames)} images.")

    ocr_by_name: Dict[str, str] = {}
    for name in image_filenames:
        full_path = os.path.join(image_folder, name)
        if os.path.exists(full_path):
            ocr_by_name[name] = run_gemini_ocr(full_path, api_key)
        else:
            # Record a placeholder instead of calling the API for a missing file.
            logger.error(f"Image not found: {full_path}")
            ocr_by_name[name] = "[Image file not found]"

    logger.info("Batch OCR complete.")
    return ocr_by_name