Spaces:

Sk4467
/

OCR-annotation

Sleeping

App Files Files Community

OCR-annotation / backend /app /services /ocr_processor.py

Sk4467

Upload 108 files

1e83c8a verified 4 months ago

raw

history blame contribute delete

3.33 kB

	# Unified backend OCR processor using Google Gemini

	import os
	import base64
	import logging
	import time
	from typing import List, Dict, Optional

	import google.generativeai as genai

	# Configure logging at import time: INFO level, with a timestamp and the
	# level name in front of every message. `logger` is this module's logger.
	logging.basicConfig(
	    format="[%(asctime)s] %(levelname)s - %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)


	# Lower-case file extensions (without the leading dot) that get_mime_type
	# accepts as OCR input.
	SUPPORTED_IMAGE_TYPES = {
	    "bmp",
	    "jpeg",
	    "jpg",
	    "png",
	    "tiff",
	    "webp",
	}


	def encode_image_to_base64(image_path: str) -> Optional[str]:
	    """Read an image file and return its contents as a base64 string.

	    Args:
	        image_path: Filesystem path of the image to read.

	    Returns:
	        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``.
	        ``None`` when the path does not exist or when reading/encoding
	        fails; both cases are logged as errors.
	    """
	    if os.path.exists(image_path):
	        try:
	            with open(image_path, "rb") as handle:
	                raw_bytes = handle.read()
	            return base64.b64encode(raw_bytes).decode("utf-8")
	        except Exception as exc:
	            logger.error(f"Failed to read or encode image {image_path}: {exc}")
	            return None

	    logger.error(f"Image not found: {image_path}")
	    return None


	def get_mime_type(image_path: str) -> Optional[str]:
	    """Map an image path's file extension to an ``image/*`` MIME type.

	    Args:
	        image_path: Path (or bare file name) of the image.

	    Returns:
	        ``"image/<ext>"`` for extensions listed in ``SUPPORTED_IMAGE_TYPES``
	        (``jpg`` is reported as ``image/jpeg``), or ``None`` when the
	        extension is missing or unsupported; that case is logged as a warning.
	    """
	    # os.path.splitext inspects only the final path component, so a dot in
	    # a directory name or an extension-less file name (e.g. a file called
	    # "png") is not mistaken for an image extension.
	    ext = os.path.splitext(image_path)[1].lstrip(".").lower()
	    if ext in SUPPORTED_IMAGE_TYPES:
	        return f"image/{'jpeg' if ext == 'jpg' else ext}"
	    logger.warning(f"Unsupported image format: {ext}")
	    return None


	def run_gemini_ocr(image_path: str, api_key: str, max_retries: int = 3) -> str:
	    """Extract Odia text from one image with the ``gemini-1.5-flash`` model.

	    Args:
	        image_path: Path of the image to send for OCR.
	        api_key: API key passed to ``genai.configure``.
	        max_retries: Maximum number of ``generate_content`` attempts. A
	            one-second pause separates consecutive attempts.

	    Returns:
	        The stripped response text on success, or ``"[No text extracted]"``
	        when the response text is empty. On failure a bracketed placeholder
	        string is returned instead: ``"[Image could not be processed]"``
	        when the image cannot be read or has an unsupported type, or an
	        ``"[OCR failed after ...]"`` message once the attempts are used up.
	        Exceptions raised while issuing the request are caught and reported
	        in that string. The function always returns a ``str``.
	    """
	    genai.configure(api_key=api_key)
	    model = genai.GenerativeModel("gemini-1.5-flash")

	    base64_image = encode_image_to_base64(image_path)
	    mime_type = get_mime_type(image_path)

	    if base64_image is None or mime_type is None:
	        return "[Image could not be processed]"

	    prompt = (
	        "Extract all visible Odia (ଓଡ଼ିଆ) text from the image accurately.\n"
	        "Only output the Odia text content. Do not explain or translate anything.\n"
	        "If no Odia text is found, return '[No Odia text found]'."
	    )

	    # The request payload and sampling settings are identical on every
	    # attempt, so build them once outside the retry loop.
	    contents = [
	        prompt,
	        {
	            "mime_type": mime_type,
	            "data": base64_image,
	        },
	    ]
	    generation_config = {
	        "temperature": 0.2,
	        "max_output_tokens": 2048,
	        "top_p": 0.8,
	        "top_k": 40,
	    }

	    for attempt in range(max_retries):
	        try:
	            response = model.generate_content(
	                contents,
	                generation_config=generation_config,
	            )

	            raw_text = response.text
	            text = raw_text.strip() if raw_text else "[No text extracted]"
	            logger.info(f"OCR complete for {os.path.basename(image_path)}")
	            return text

	        except Exception as e:
	            logger.error(f"OCR attempt {attempt + 1} failed for {image_path}: {e}")
	            if attempt == max_retries - 1:
	                return f"[OCR failed after {max_retries} attempts: {str(e)}]"
	            # Brief pause before retrying; skipped after the final attempt.
	            time.sleep(1)

	    # Reached only when max_retries < 1, i.e. the loop body never ran.
	    # Return a placeholder so callers always receive a string, not None.
	    logger.error(f"OCR skipped for {image_path}: max_retries={max_retries}")
	    return f"[OCR failed after {max_retries} attempts: no attempt was made]"


	def batch_run_ocr(image_filenames: List[str], image_folder: str, api_key: str) -> Dict[str, str]:
	    """Run OCR over several images that live in one folder.

	    Args:
	        image_filenames: File names to process, relative to ``image_folder``.
	        image_folder: Directory that contains the images.
	        api_key: API key forwarded to ``run_gemini_ocr`` for every image.

	    Returns:
	        A dict keyed by file name. Each value is whatever ``run_gemini_ocr``
	        returned for that image, or ``"[Image file not found]"`` when the
	        file does not exist on disk.
	    """
	    logger.info(f"Starting batch OCR on {len(image_filenames)} images.")
	    ocr_by_name: Dict[str, str] = {}

	    for name in image_filenames:
	        full_path = os.path.join(image_folder, name)
	        if os.path.exists(full_path):
	            ocr_by_name[name] = run_gemini_ocr(full_path, api_key)
	        else:
	            # Record a placeholder rather than calling the OCR service.
	            logger.error(f"Image not found: {full_path}")
	            ocr_by_name[name] = "[Image file not found]"

	    logger.info("Batch OCR complete.")
	    return ocr_by_name