Spaces:

sameernotes
/

ocr

Sleeping

ocr

File size: 5,633 Bytes

6ae7708

import os
import io
import sys
import cv2
import base64
import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import tempfile
import sakshi_ocr

from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse

# Define paths to your assets (update these if necessary)
MODEL_PATH = 'hindi_ocr_model.keras'
ENCODER_PATH = 'label_encoder.pkl'
FONT_PATH = 'NotoSansDevanagari-Regular.ttf'

# Load custom font if available
if os.path.exists(FONT_PATH):
    fm.fontManager.addfont(FONT_PATH)
    plt.rcParams['font.family'] = 'Noto Sans Devanagari'
else:
    print("Custom font not found. Using default font.")

# Load the OCR model
def load_model():
    if not os.path.exists(MODEL_PATH):
        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
    return tf.keras.models.load_model(MODEL_PATH)

# Load the label encoder
def load_label_encoder():
    if not os.path.exists(ENCODER_PATH):
        raise FileNotFoundError(f"Label encoder file not found at {ENCODER_PATH}")
    with open(ENCODER_PATH, 'rb') as f:
        return pickle.load(f)

# Global loading so they persist across requests
model = load_model()
label_encoder = load_label_encoder()

# Function for word detection
def detect_words(image):
    # Assume input is a grayscale image
    _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = np.ones((3, 3), np.uint8)
    dilated = cv2.dilate(binary, kernel, iterations=2)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    word_img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    word_count = 0
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 10 and h > 10:
            cv2.rectangle(word_img, (x, y), (x+w, y+h), (0, 255, 0), 2)
            word_count += 1
    return word_img, word_count

# Function to run Sakshi OCR and capture its output
def run_sakshi_ocr(image_path):
    buffer = io.StringIO()
    old_stdout = sys.stdout
    sys.stdout = buffer
    try:
        sakshi_ocr.generate(image_path)
    finally:
        sys.stdout = old_stdout
    return buffer.getvalue()

# Utility function: convert image (numpy array) to a base64 encoded string
def image_to_base64(image, ext=".png"):
    success, encoded_image = cv2.imencode(ext, image)
    if not success:
        return None
    return base64.b64encode(encoded_image).decode('utf-8')

# Initialize FastAPI app
app = FastAPI(title="Hindi OCR App by sakshi")

@app.get("/", response_class=HTMLResponse)
async def root():
    html_content = """
    <html>
      <head>
        <title>Hindi OCR App by sakshi</title>
      </head>
      <body>
        <h1>Hindi OCR App by sakshi</h1>
        <form action="/predict" enctype="multipart/form-data" method="post">
          <input name="file" type="file" accept="image/*">
          <input type="submit" value="Upload and Predict">
        </form>
      </body>
    </html>
    """
    return HTMLResponse(content=html_content)

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    # Read and decode the uploaded image
    contents = await file.read()
    nparr = np.frombuffer(contents, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise HTTPException(status_code=400, detail="Error reading the image.")
    
    # Encode the original image to base64 for visualization
    original_image = image_to_base64(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR))
    
    # Word detection
    word_img, word_count = detect_words(img)
    word_img_encoded = image_to_base64(word_img)
    
    # OCR model prediction for single word
    try:
        img_resized = cv2.resize(img, (128, 32))
        img_norm = img_resized / 255.0
        img_input = img_norm[np.newaxis, ..., np.newaxis]  # shape: (1, 32, 128, 1)
        pred = model.predict(img_input)
        pred_label_idx = np.argmax(pred)
        pred_label = label_encoder.inverse_transform([pred_label_idx])[0]
        
        # Generate an image with the prediction using matplotlib
        fig, ax = plt.subplots()
        ax.imshow(img, cmap='gray')
        ax.set_title(f"Predicted: {pred_label}", fontsize=12)
        ax.axis('off')
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
        pred_img_array = np.frombuffer(buf.getvalue(), np.uint8)
        prediction_img = cv2.imdecode(pred_img_array, cv2.IMREAD_COLOR)
        prediction_img_encoded = image_to_base64(prediction_img)
        plt.close(fig)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error in OCR model processing: {e}")
    
    # Run Sakshi OCR on the image by saving temporarily
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
            cv2.imwrite(tmp_file.name, img)
            tmp_file_path = tmp_file.name
        sakshi_output = run_sakshi_ocr(tmp_file_path)
        os.remove(tmp_file_path)
    except Exception as e:
        sakshi_output = f"Error running Sakshi OCR: {e}"
    
    # Prepare the response
    response_data = {
        "word_count": word_count,
        "ocr_prediction": pred_label,
        "sakshi_ocr_output": sakshi_output,
        "original_image": original_image,
        "word_detected_image": word_img_encoded,
        "prediction_image": prediction_img_encoded
    }
    
    return JSONResponse(content=response_data)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)