Spaces:

sameernotes
/

ocr

Sleeping

App Files Files Community

sameernotes committed on Mar 19, 2025

Commit

214c905

verified ·

1 Parent(s): fc14550

Upload 2 files

Browse files

Files changed (2) hide show

app.py +156 -107
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,66 +1,101 @@
-import os
-import io
-import sys
 import cv2
-import base64
-import pickle
 import numpy as np
 import tensorflow as tf
 import matplotlib.pyplot as plt
 import matplotlib.font_manager as fm
-import tempfile
 import sakshi_ocr
-from fastapi import FastAPI, File, UploadFile, HTTPException
-from fastapi.responses import HTMLResponse, JSONResponse
-# Define paths to your assets (update these if necessary)
-MODEL_PATH = 'hindi_ocr_model.keras'
-ENCODER_PATH = 'label_encoder.pkl'
-FONT_PATH = 'NotoSansDevanagari-Regular.ttf'
-# Load custom font if available
-if os.path.exists(FONT_PATH):
-    fm.fontManager.addfont(FONT_PATH)
-    plt.rcParams['font.family'] = 'Noto Sans Devanagari'
-else:
-    print("Custom font not found. Using default font.")
-# Load the OCR model
 def load_model():
     if not os.path.exists(MODEL_PATH):
-        raise FileNotFoundError(f"Model file not found at {MODEL_PATH}")
     return tf.keras.models.load_model(MODEL_PATH)
-# Load the label encoder
 def load_label_encoder():
     if not os.path.exists(ENCODER_PATH):
-        raise FileNotFoundError(f"Label encoder file not found at {ENCODER_PATH}")
     with open(ENCODER_PATH, 'rb') as f:
         return pickle.load(f)
-# Global loading so they persist across requests
-model = load_model()
-label_encoder = load_label_encoder()
-# Function for word detection
 def detect_words(image):
-    # Assume input is a grayscale image
     _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-    kernel = np.ones((3, 3), np.uint8)
     dilated = cv2.dilate(binary, kernel, iterations=2)
     contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     word_img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
     word_count = 0
     for contour in contours:
         x, y, w, h = cv2.boundingRect(contour)
         if w > 10 and h > 10:
             cv2.rectangle(word_img, (x, y), (x+w, y+h), (0, 255, 0), 2)
             word_count += 1
     return word_img, word_count
-# Function to run Sakshi OCR and capture its output
 def run_sakshi_ocr(image_path):
     buffer = io.StringIO()
     old_stdout = sys.stdout
@@ -71,96 +106,110 @@ def run_sakshi_ocr(image_path):
         sys.stdout = old_stdout
     return buffer.getvalue()
-# Utility function: convert image (numpy array) to a base64 encoded string
-def image_to_base64(image, ext=".png"):
-    success, encoded_image = cv2.imencode(ext, image)
-    if not success:
-        return None
-    return base64.b64encode(encoded_image).decode('utf-8')
-# Initialize FastAPI app
-app = FastAPI(title="Hindi OCR App by sakshi")
-@app.get("/", response_class=HTMLResponse)
-async def root():
-    html_content = """
-    <html>
-      <head>
-        <title>Hindi OCR App by sakshi</title>
-      </head>
-      <body>
-        <h1>Hindi OCR App by sakshi</h1>
-        <form action="/predict" enctype="multipart/form-data" method="post">
-          <input name="file" type="file" accept="image/*">
-          <input type="submit" value="Upload and Predict">
-        </form>
-      </body>
-    </html>
-    """
-    return HTMLResponse(content=html_content)
-@app.post("/predict")
-async def predict(file: UploadFile = File(...)):
-    # Read and decode the uploaded image
-    contents = await file.read()
-    nparr = np.frombuffer(contents, np.uint8)
-    img = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
-    if img is None:
-        raise HTTPException(status_code=400, detail="Error reading the image.")
-    # Encode the original image to base64 for visualization
-    original_image = image_to_base64(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR))
     # Word detection
-    word_img, word_count = detect_words(img)
-    word_img_encoded = image_to_base64(word_img)
-    # OCR model prediction for single word
     try:
         img_resized = cv2.resize(img, (128, 32))
         img_norm = img_resized / 255.0
-        img_input = img_norm[np.newaxis, ..., np.newaxis]  # shape: (1, 32, 128, 1)
-        pred = model.predict(img_input)
-        pred_label_idx = np.argmax(pred)
-        pred_label = label_encoder.inverse_transform([pred_label_idx])[0]
-        # Generate an image with the prediction using matplotlib
-        fig, ax = plt.subplots()
-        ax.imshow(img, cmap='gray')
-        ax.set_title(f"Predicted: {pred_label}", fontsize=12)
-        ax.axis('off')
-        buf = io.BytesIO()
-        plt.savefig(buf, format="png")
-        buf.seek(0)
-        pred_img_array = np.frombuffer(buf.getvalue(), np.uint8)
-        prediction_img = cv2.imdecode(pred_img_array, cv2.IMREAD_COLOR)
-        prediction_img_encoded = image_to_base64(prediction_img)
-        plt.close(fig)
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error in OCR model processing: {e}")
-    # Run Sakshi OCR on the image by saving temporarily
-    try:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
-            cv2.imwrite(tmp_file.name, img)
-            tmp_file_path = tmp_file.name
-        sakshi_output = run_sakshi_ocr(tmp_file_path)
-        os.remove(tmp_file_path)
-    except Exception as e:
-        sakshi_output = f"Error running Sakshi OCR: {e}"
-    # Prepare the response
-    response_data = {
         "word_count": word_count,
-        "ocr_prediction": pred_label,
-        "sakshi_ocr_output": sakshi_output,
-        "original_image": original_image,
-        "word_detected_image": word_img_encoded,
-        "prediction_image": prediction_img_encoded
     }
-    return JSONResponse(content=response_data)
 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
+from pydantic import BaseModel
 import cv2
 import numpy as np
 import tensorflow as tf
+import pickle
 import matplotlib.pyplot as plt
 import matplotlib.font_manager as fm
 import sakshi_ocr
+import os
+import io
+import sys
+import tempfile
+import requests
+from PIL import Image
+import uvicorn
+import shutil
+from pathlib import Path
+app = FastAPI(
+    title="Hindi OCR API",
+    description="API for Hindi OCR and word detection",
+    version="1.0.0"
+)
+# URLs for the model and encoder hosted on Hugging Face
+MODEL_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/hindi_ocr_model.keras"
+ENCODER_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/label_encoder.pkl"
+FONT_URL = "https://huggingface.co/sameernotes/hindi-ocr/resolve/main/NotoSansDevanagari-Regular.ttf"
+# Paths for local storage
+MODEL_PATH = "hindi_ocr_model.keras"
+ENCODER_PATH = "label_encoder.pkl"
+FONT_PATH = "NotoSansDevanagari-Regular.ttf"
+OUTPUT_DIR = "output"
+# Create output directory if it doesn't exist
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# Download model and encoder
+def download_file(url, dest):
+    """Download ``url`` and write the response body to ``dest``.
+
+    Raises ``requests.HTTPError`` for a 4xx/5xx response so that an error
+    page is never saved to disk in place of the expected file.
+    """
+    # Bound connect/read waits so a stalled connection cannot hang forever.
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+    with open(dest, 'wb') as f:
+        f.write(response.content)
+# Load the model and encoder
 def load_model():
+    # Load the Keras model stored at MODEL_PATH.
+    # Returns None when MODEL_PATH does not exist; otherwise returns the
+    # result of tf.keras.models.load_model(MODEL_PATH).
     if not os.path.exists(MODEL_PATH):
+        return None
     return tf.keras.models.load_model(MODEL_PATH)
 def load_label_encoder():
+    # Load the pickled label encoder stored at ENCODER_PATH.
+    # Returns None when ENCODER_PATH does not exist; otherwise returns the
+    # object unpickled from that file.
     if not os.path.exists(ENCODER_PATH):
+        return None
     with open(ENCODER_PATH, 'rb') as f:
+        # NOTE(review): pickle.load can execute code embedded in the file;
+        # this is only safe while ENCODER_PATH comes from a trusted source.
         return pickle.load(f)
+# Download required files on startup
+@app.on_event("startup")
+async def startup_event():
+    """Fetch missing assets, register the font, then load model and encoder."""
+    global model, label_encoder
+    # Fetch each asset only when no local copy exists yet.
+    assets = (
+        (MODEL_URL, MODEL_PATH),
+        (ENCODER_URL, ENCODER_PATH),
+        (FONT_URL, FONT_PATH),
+    )
+    for asset_url, asset_path in assets:
+        if not os.path.exists(asset_path):
+            download_file(asset_url, asset_path)
+    # Use the Devanagari font for plots when the font file is on disk.
+    if os.path.exists(FONT_PATH):
+        fm.fontManager.addfont(FONT_PATH)
+        plt.rcParams['font.family'] = 'Noto Sans Devanagari'
+    # Populate the module-level handles that process_image reads.
+    model = load_model()
+    label_encoder = load_label_encoder()
+# Word detection function
 def detect_words(image):
+    # Count and outline word-like regions in `image`.
+    # `image` is passed to cv2.COLOR_GRAY2BGR below, so it is expected to be
+    # a single-channel (grayscale) array.
+    # Returns (word_img, word_count): a BGR copy of the input with a
+    # rectangle drawn around each counted region, and the number of regions.
+    # Inverted binary threshold with the level chosen by Otsu's method.
     _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    kernel = np.ones((3,3), np.uint8)
+    # Two dilation passes with the 3x3 kernel before contour extraction.
     dilated = cv2.dilate(binary, kernel, iterations=2)
     contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     word_img = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
     word_count = 0
     for contour in contours:
         x, y, w, h = cv2.boundingRect(contour)
+        # Only boxes more than 10 px wide and more than 10 px tall count.
         if w > 10 and h > 10:
             cv2.rectangle(word_img, (x, y), (x+w, y+h), (0, 255, 0), 2)
             word_count += 1
     return word_img, word_count
+# Sakshi OCR output capture
 def run_sakshi_ocr(image_path):
     buffer = io.StringIO()
     old_stdout = sys.stdout
         sys.stdout = old_stdout
     return buffer.getvalue()
+# Main OCR processing function
+def process_image(image_array):
+    """Run word detection, the Keras OCR model and Sakshi OCR on one image.
+
+    Args:
+        image_array: Image as a NumPy array. A 2-D array is used as-is as
+            the grayscale image; a 4-channel array is converted as RGBA and
+            any other array is converted as RGB.
+
+    Returns:
+        dict with keys "sakshi_output", "word_detection_path", "word_count",
+        "prediction_path" (None when no prediction plot was written) and
+        "prediction_label" (the predicted label, or a message describing
+        why no prediction was made).
+    """
+    # Convert image array to grayscale
+    if image_array.ndim == 2:
+        img = image_array
+    elif image_array.shape[2] == 4:
+        img = cv2.cvtColor(image_array, cv2.COLOR_RGBA2GRAY)
+    else:
+        img = cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY)
     # Word detection
+    word_detected_img, word_count = detect_words(img)
+    word_detection_path = os.path.join(OUTPUT_DIR, "word_detection.png")
+    cv2.imwrite(word_detection_path, word_detected_img)
+    # First OCR model prediction
     try:
         img_resized = cv2.resize(img, (128, 32))
         img_norm = img_resized / 255.0
+        img_input = img_norm[np.newaxis, ..., np.newaxis]  # Shape: (1, 32, 128, 1)
+        if model is not None and label_encoder is not None:
+            pred = model.predict(img_input)
+            pred_label_idx = np.argmax(pred)
+            pred_label = label_encoder.inverse_transform([pred_label_idx])[0]
+            # Create plot with prediction
+            fig, ax = plt.subplots()
+            try:
+                ax.imshow(img, cmap='gray')
+                ax.set_title(f"Predicted: {pred_label}", fontsize=12)
+                ax.axis('off')
+                pred_path = os.path.join(OUTPUT_DIR, "prediction.png")
+                fig.savefig(pred_path)
+            finally:
+                # Release the figure even when drawing or saving fails.
+                plt.close(fig)
+        else:
+            pred_path = None
+            pred_label = "Model or encoder not loaded"
     except Exception as e:
+        pred_path = None
+        pred_label = f"Error: {str(e)}"
+    # Sakshi OCR processing
+    # Close the handle before the path is reopened by other code, and always
+    # delete the file, even when Sakshi OCR raises.
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
+        tmp_path = tmp_file.name
+    try:
+        cv2.imwrite(tmp_path, img)
+        sakshi_output = run_sakshi_ocr(tmp_path)
+    finally:
+        os.remove(tmp_path)
+    return {
+        "sakshi_output": sakshi_output,
+        "word_detection_path": word_detection_path,
         "word_count": word_count,
+        "prediction_path": pred_path,
+        "prediction_label": pred_label
     }
+# Response body of POST /process/; built from the dict that process_image
+# returns.
+class OCRResponse(BaseModel):
+    # Output captured from the Sakshi OCR run.
+    sakshi_output: str
+    # Number of regions counted by detect_words.
+    word_count: int
+    # Predicted label, or a message when no prediction was made.
+    prediction_label: str
+@app.post("/process/", response_model=OCRResponse)
+async def process(file: UploadFile = File(...)):
+    """Run the OCR pipeline on an uploaded image.
+
+    Responds with 400 when the upload is not declared as an image and with
+    500 when reading or processing the image fails.
+    """
+    # Check if the file is an image. content_type may be None when the client
+    # sends no Content-Type for the part, so fall back to an empty string.
+    if not (file.content_type or "").startswith("image/"):
+        raise HTTPException(status_code=400, detail="File must be an image")
+    # Create a temporary file to save the uploaded image
+    temp_file = tempfile.NamedTemporaryFile(delete=False)
+    try:
+        # Save the uploaded file
+        with temp_file as f:
+            shutil.copyfileobj(file.file, f)
+        # Open and process the image. The context manager releases the file
+        # handle before the temporary file is unlinked below.
+        with Image.open(temp_file.name) as image:
+            # process_image converts with COLOR_RGB2GRAY, so hand it RGB
+            # regardless of the source mode (grayscale, palette, RGBA, ...).
+            image_array = np.array(image.convert("RGB"))
+        result = process_image(image_array)
+        return OCRResponse(
+            sakshi_output=result["sakshi_output"],
+            word_count=result["word_count"],
+            prediction_label=result["prediction_label"]
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
+    finally:
+        # Clean up the temporary file
+        os.unlink(temp_file.name)
+@app.get("/word-detection/")
+async def get_word_detection():
+    """Return the word detection image."""
+    # process_image writes this file; until it has run there is nothing to serve.
+    image_file = Path(OUTPUT_DIR, "word_detection.png")
+    if image_file.exists():
+        return FileResponse(image_file)
+    raise HTTPException(status_code=404, detail="Word detection image not found. Process an image first.")
+@app.get("/prediction/")
+async def get_prediction():
+    """Return the prediction image."""
+    # process_image writes this file only when a prediction plot is produced.
+    image_file = Path(OUTPUT_DIR, "prediction.png")
+    if image_file.exists():
+        return FileResponse(image_file)
+    raise HTTPException(status_code=404, detail="Prediction image not found. Process an image first.")
+@app.get("/")
+async def root():
+    # Landing endpoint: a fixed status message pointing at the POST route.
+    status = {"message": "Hindi OCR API is running. Use POST /process/ to analyze images."}
+    return status
+# For local testing
 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ opencv-python
 matplotlib
 scikit-learn
 python-multipart
-sakshi-ocr

 matplotlib
 scikit-learn
 python-multipart
+sakshi-ocr
+pydantic
+requests