File size: 4,846 Bytes
478f262
 
 
 
 
e84c69b
 
 
 
478f262
e84c69b
478f262
e84c69b
478f262
e84c69b
478f262
 
fbe00ca
e84c69b
 
 
 
 
 
478f262
e84c69b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478f262
e84c69b
 
 
478f262
 
e84c69b
 
 
 
478f262
 
 
 
 
 
 
 
 
 
e84c69b
 
 
 
 
 
 
 
 
 
 
478f262
e84c69b
 
 
 
 
 
 
 
 
 
 
 
478f262
e84c69b
 
478f262
e84c69b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478f262
e84c69b
 
 
 
 
478f262
 
e84c69b
 
 
 
 
 
 
 
 
 
 
 
478f262
e84c69b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import contextlib
import io
import os
import pickle
import sys
import tempfile

import cv2
import gradio as gr
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import requests
import tensorflow as tf

import sakshi_ocr

# URLs for the model and encoder hosted on Hugging Face.
# Downloaded once at startup into the Space's working directory (see below).
MODEL_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/hindi_ocr_model.keras"
ENCODER_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/label_encoder.pkl"
FONT_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/NotoSansDevanagari-Regular.ttf"  # Optional font

def download_file(url, dest):
    """Download *url* to the local path *dest*.

    Streams the response in chunks so a large model file never has to fit
    in memory, and raises for HTTP errors (404/500) instead of silently
    writing an error page to disk where it would later fail to deserialize.

    Args:
        url: HTTP(S) URL to fetch.
        dest: local filesystem path to write.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    with open(dest, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

# Paths for local storage in Hugging Face Spaces
MODEL_PATH = "hindi_ocr_model.keras"
ENCODER_PATH = "label_encoder.pkl"
FONT_PATH = "NotoSansDevanagari-Regular.ttf"

# Download models and font if not already present.
# Runs at import time so the app is ready before Gradio starts serving.
if not os.path.exists(MODEL_PATH):
    download_file(MODEL_URL, MODEL_PATH)
if not os.path.exists(ENCODER_PATH):
    download_file(ENCODER_URL, ENCODER_PATH)
if not os.path.exists(FONT_PATH):
    download_file(FONT_URL, FONT_PATH)

# Load the custom font if available, so matplotlib can render Devanagari
# glyphs in the prediction plot instead of tofu boxes.
if os.path.exists(FONT_PATH):
    fm.fontManager.addfont(FONT_PATH)
    plt.rcParams['font.family'] = 'Noto Sans Devanagari'

# Load the model and encoder
def load_model():
    """Load the Keras OCR model from MODEL_PATH, or None if the file is missing."""
    if os.path.exists(MODEL_PATH):
        return tf.keras.models.load_model(MODEL_PATH)
    return None

def load_label_encoder():
    """Unpickle the fitted label encoder from ENCODER_PATH, or None if absent."""
    if os.path.exists(ENCODER_PATH):
        with open(ENCODER_PATH, 'rb') as encoder_file:
            return pickle.load(encoder_file)
    return None

# Load the model artifacts once at import time so every request reuses them;
# either may be None if the download above failed.
model = load_model()
label_encoder = load_label_encoder()

def detect_words(image, min_width=10, min_height=10):
    """Detect word-like regions in a grayscale image and box them.

    Otsu-thresholds the image (inverted, so ink becomes foreground), dilates
    to merge the characters of a word into one connected blob, then draws a
    green bounding box around each blob larger than the minimum size.

    Args:
        image: 2-D uint8 grayscale image (OpenCV convention).
        min_width: minimum bounding-box width to count as a word
            (default 10, matching the previous hard-coded threshold).
        min_height: minimum bounding-box height to count as a word.

    Returns:
        Tuple of (BGR image annotated with green rectangles, word count).
    """
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = np.ones((3, 3), np.uint8)
    dilated = cv2.dilate(binary, kernel, iterations=2)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    word_img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    word_count = 0

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        # Ignore specks below the minimum size (noise, stray diacritics).
        if w > min_width and h > min_height:
            cv2.rectangle(word_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
            word_count += 1

    return word_img, word_count

def run_sakshi_ocr(image_path):
    """Run the sakshi_ocr engine on *image_path* and return its printed output.

    sakshi_ocr.generate() prints its result to stdout rather than returning
    it, so stdout is captured for the duration of the call.
    contextlib.redirect_stdout restores sys.stdout even if generate() raises,
    replacing the manual save/swap/restore dance.

    Args:
        image_path: path to an image file on disk.

    Returns:
        Everything the engine printed, as a single string.
    """
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        sakshi_ocr.generate(image_path)
    return buffer.getvalue()

# Main OCR processing function
def process_image(image):
    """Run the full pipeline on one uploaded image.

    Args:
        image: PIL RGB image from Gradio, or None when nothing was uploaded.

    Returns:
        Tuple of (sakshi OCR text, word-detection image, word count,
        prediction plot as a BGR numpy image — or None when the model is
        unavailable or prediction fails).
    """
    if image is None:
        return "Error: No image provided", None, 0, "No prediction available"

    # Convert PIL image to OpenCV format (grayscale)
    img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)

    # Word detection
    word_detected_img, word_count = detect_words(img)

    # First OCR model prediction (single-word classifier + annotated plot)
    pred_image = _predict_word(img)

    # Sakshi OCR needs a file path, so write the image to a unique temp
    # file; close the handle before cv2.imwrite for Windows compatibility,
    # and always clean up even if the OCR call raises.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
        tmp_path = tmp_file.name
    try:
        cv2.imwrite(tmp_path, img)
        sakshi_output = run_sakshi_ocr(tmp_path)
    finally:
        os.remove(tmp_path)

    return sakshi_output, word_detected_img, word_count, pred_image


def _predict_word(img):
    """Classify *img* with the Keras model and render an annotated plot.

    Resizes to the model's 128x32 input, predicts a single label, and
    draws the image with the predicted label as the plot title.

    Args:
        img: 2-D uint8 grayscale image.

    Returns:
        BGR numpy image of the rendered plot, or None when the model or
        encoder is not loaded or any step fails (matching the previous
        behavior of returning None on error).
    """
    try:
        if model is None or label_encoder is None:
            return None

        img_resized = cv2.resize(img, (128, 32))
        img_norm = img_resized / 255.0
        img_input = img_norm[np.newaxis, ..., np.newaxis]  # Shape: (1, 32, 128, 1)

        pred = model.predict(img_input)
        pred_label_idx = np.argmax(pred)
        pred_label = label_encoder.inverse_transform([pred_label_idx])[0]

        # Create plot with prediction
        fig, ax = plt.subplots()
        ax.imshow(img, cmap='gray')
        ax.set_title(f"Predicted: {pred_label}", fontsize=12)
        ax.axis('off')

        # Use a unique temp file rather than the fixed name "temp_plot.png":
        # concurrent Gradio requests would otherwise clobber each other's
        # plot and race on os.remove.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as plot_file:
            plot_path = plot_file.name
        try:
            plt.savefig(plot_path)
            plt.close(fig)
            return cv2.imread(plot_path)
        finally:
            os.remove(plot_path)
    except Exception:
        # Preserve the original contract: any failure yields no plot image.
        return None

# Gradio Interface: wires process_image to a simple web UI. The four
# outputs correspond positionally to process_image's return tuple
# (sakshi text, annotated word image, word count, prediction plot).
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[
        gr.Textbox(label="Sakshi OCR Output"),
        gr.Image(label="Word Detection", type="numpy"),
        gr.Number(label="Word Count"),
        gr.Image(label="Hindi OCR Prediction", type="numpy")
    ],
    title="Hindi OCR App by Sakshi",
    description="Upload an image to perform Hindi OCR and word detection."
)

# Launch the app (blocks; serves the UI until the process exits)
interface.launch()