Spaces:

Angstormy
/

hindi-ocr-api

Runtime error

App Files Files Community

Angstormy committed on Apr 1

Commit

7b2d644

verified ·

1 Parent(s): f71d7e4

Upload 5 Files

Browse files

Files changed (5) hide show

Dockerfile +19 -0
api.py +197 -0
best_model_20k.pt +3 -0
requirements.txt +10 -0
vocab.json +103 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,19 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install minimal system dependencies.
+# --no-install-recommends keeps optional "recommended" packages out of the image;
+# the apt lists are removed in the same layer so they never reach the final image.
+RUN apt-get update && apt-get install -y --no-install-recommends libglib2.0-0 && rm -rf /var/lib/apt/lists/*
+# Install Python requirements first so this layer is cached until requirements.txt changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy all the model files and api.py
+COPY . .
+# Expose port 7860 (Hugging Face Spaces default port)
+EXPOSE 7860
+# Start the FastAPI server using uvicorn
+CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]

api.py ADDED Viewed

	@@ -0,0 +1,197 @@

+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+import torch
+import torch.nn as nn
+import torchvision.models as models
+import torchvision.transforms as transforms
+from PIL import Image, ImageOps
+import json
+import io
+import os
+import cv2
+import numpy as np
+import base64
+import math
+app = FastAPI()
+# Allow CORS for React development
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# --- Model Architecture (LITERAL SYNC from Training3.ipynb) ---
+class PositionalEncoding1D(nn.Module):
+    def __init__(self, d_model, max_len=512):
+        super().__init__()
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer('pe', pe.unsqueeze(0))
+    def forward(self, x):
+        return x + self.pe[:, :x.size(1)]
+class OCRModel(nn.Module):
+    """Image-to-text model: a ResNet-34 feature extractor feeding a Transformer decoder.
+
+    The attribute names are part of the contract: the checkpoint is loaded with
+    ``load_state_dict(..., strict=True)``, which matches keys against these names.
+    """
+    def __init__(self, vocab_size):
+        """Build the layers; ``vocab_size`` sizes the embedding table and the output logits."""
+        super().__init__()
+        # EXACT LAYER NAMES FROM CHECKPOINT -- do not rename the attributes below.
+        resnet = models.resnet34(weights=None)
+        # Drop the last two ResNet children (avgpool and fc in torchvision's ResNet)
+        # so the encoder returns a spatial feature map instead of class scores.
+        self.encoder = nn.Sequential(*list(resnet.children())[:-2])
+        # 1x1 convolution: 512 encoder channels -> 256, the decoder's d_model.
+        self.enc_proj = nn.Conv2d(512, 256, 1)
+        self.token_embed = nn.Embedding(vocab_size, 256)
+        self.pos_decoder = PositionalEncoding1D(256)
+        decoder_layer = nn.TransformerDecoderLayer(d_model=256, nhead=4, dim_feedforward=1024, batch_first=True)
+        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=4)
+        self.output_layer = nn.Linear(256, vocab_size)
+    def forward(self, images, tgt):
+        """Return next-token logits for every position of ``tgt``.
+
+        ``tgt`` holds token ids, one sequence per row (it is embedded and its
+        dimension 1 is used as the sequence length). The result has one
+        ``vocab_size``-wide logit vector per target position.
+        NOTE(review): ``images`` is presumably (batch, 3, H, W) -- confirm against the training notebook.
+        """
+        feat = self.encoder(images)
+        feat = self.enc_proj(feat)
+        # Flatten the spatial grid into a sequence: (B, 256, H', W') -> (B, H'*W', 256).
+        memory = feat.flatten(2).permute(0, 2, 1)
+        tgt = self.token_embed(tgt)
+        tgt = self.pos_decoder(tgt)
+        # Causal mask: True strictly above the diagonal, i.e. on the future positions.
+        mask = torch.triu(torch.ones(tgt.size(1), tgt.size(1), device=tgt.device), 1).bool()
+        out = self.decoder(tgt, memory, tgt_mask=mask)
+        return self.output_layer(out)
+def fuzzy_correct(text):
+    """Pass-through: Identity logic for literal model verification."""
+    return text
+# --- Global Resources ---
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = None
+stoi = None
+itos = None
+def load_resources():
+    global model, stoi, itos
+    model_path = "best_model_20k.pt"
+    vocab_path = "vocab.json"
+    if os.path.exists(model_path):
+        checkpoint = torch.load(model_path, map_location=device)
+        if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
+            state_dict = checkpoint["model_state_dict"]
+            stoi = checkpoint["stoi"]
+            itos = {int(k): v for k, v in checkpoint["itos"].items()}
+        else:
+            state_dict = checkpoint
+            if os.path.exists(vocab_path):
+                with open(vocab_path, "r", encoding="utf-8") as f:
+                    vdata = json.load(f)
+                    stoi = vdata["stoi"]
+                    itos = {int(k): v for k, v in vdata["itos"].items()}
+        vocab_size = len(stoi)
+        model = OCRModel(vocab_size).to(device)
+        # STRICT=TRUE IS NOW ENABLED
+        model.load_state_dict(state_dict, strict=True)
+        model.eval()
+        print(f"✅ Checkpoint mapped perfectly to brain memory ({vocab_size} classes).")
+def preprocess_image(image_bytes):
+    # 1. Load with OpenCV (for smart-focus extraction)
+    nparr = np.frombuffer(image_bytes, np.uint8)
+    img_gray = cv2.imdecode(nparr, cv2.IMREAD_GRAYSCALE)
+    # 2. Find the Word bounding box (Otsu Binarization)
+    _, thresh = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    coords = cv2.findNonZero(thresh)
+    if coords is not None:
+        x, y, w, h = cv2.boundingRect(coords)
+        # 5% padding gives the best raw convolution feature mapping
+        pad_x, pad_y = int(w * 0.05), int(h * 0.05)
+        y_max, x_max = img_gray.shape
+        x1, y1 = max(0, x - pad_x), max(0, y - pad_y)
+        x2, y2 = min(x_max, x + w + pad_x), min(y_max, y + h + pad_y)
+        img_cropped = img_gray[y1:y2, x1:x2]
+    else:
+        img_cropped = img_gray
+    # 3. LITERAL NOTEBOOK TRANSFORM (Applied on Focused Word)
+    pil_img = Image.fromarray(img_cropped).convert("L")
+    # Resize exactly as notebook (IMG_HEIGHT=48, MAX_WIDTH=160)
+    pil_img = pil_img.resize((160, 48), Image.BILINEAR)
+    # 3-Channel Grayscale (Exact Notebook Sync)
+    pil_img = transforms.Compose([
+        transforms.Grayscale(num_output_channels=3),
+        transforms.ToTensor()
+    ])(pil_img)
+    img_tensor = pil_img.unsqueeze(0).to(device)
+    # UI View (Debug log)
+    debug_arr = (img_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
+    debug_arr = cv2.cvtColor(debug_arr, cv2.COLOR_RGB2BGR)
+    _, buffer = cv2.imencode('.png', debug_arr)
+    debug_b64 = base64.b64encode(buffer).decode('utf-8')
+    return img_tensor, debug_b64
+@app.on_event("startup")
+async def startup_event():
+    load_resources()
+def greedy_decode(model, images, max_len=25):
+    """Refined Greedy Decode with special token verification."""
+    B = images.size(0)
+    BOS_VAL = stoi.get("<bos>", 1)
+    EOS_VAL = stoi.get("<eos>", 2)
+    PAD_VAL = stoi.get("<pad>", 0)
+    decoded = torch.full((B, 1), BOS_VAL, dtype=torch.long, device=device)
+    for _ in range(max_len):
+        with torch.cuda.amp.autocast():
+            logits = model(images, decoded)
+        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
+        decoded = torch.cat([decoded, next_token], dim=1)
+        if next_token.item() == EOS_VAL: break
+    ids = decoded[0].tolist()
+    # Decode string exactly as notebook does
+    out = []
+    for i in ids:
+        if i == EOS_VAL: break
+        if i in [PAD_VAL, BOS_VAL]: continue
+        out.append(itos.get(i, ""))
+    return "".join(out)
+@app.post("/predict")
+async def predict_ocr(file: UploadFile = File(...)):
+    if model is None: return {"error": "Model not loaded"}
+    try:
+        image_bytes = await file.read()
+        images, debug_b64 = preprocess_image(image_bytes)
+        prediction = greedy_decode(model, images)
+        final_prediction = fuzzy_correct(prediction)
+        print(f"RECOGNIZED: '{prediction}' -> FINAL: '{final_prediction}'")
+        return {
+            "prediction": final_prediction,
+            "engine_view": f"data:image/png;base64,{debug_b64}"
+        }
+    except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return {"error": str(e)}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

best_model_20k.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f808a21186c88003997e034616c6c8310ca9bbd5456830814b17c86c380d75b
+size 309043646

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+uvicorn
+python-multipart
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch
+torchvision
+opencv-python-headless
+numpy
+pillow
+pydantic

vocab.json ADDED Viewed

	@@ -0,0 +1,103 @@

+[
+  "<pad>",
+  "<bos>",
+  "<eos>",
+  "<unk>",
+  "-",
+  "ँ",
+  "ं",
+  "ः",
+  "अ",
+  "आ",
+  "इ",
+  "ई",
+  "उ",
+  "ऊ",
+  "ऋ",
+  "ऌ",
+  "ऍ",
+  "ऎ",
+  "ए",
+  "ऐ",
+  "ऑ",
+  "ओ",
+  "औ",
+  "क",
+  "ख",
+  "ग",
+  "घ",
+  "ङ",
+  "च",
+  "छ",
+  "ज",
+  "झ",
+  "ञ",
+  "ट",
+  "ठ",
+  "ड",
+  "ढ",
+  "ण",
+  "त",
+  "थ",
+  "द",
+  "ध",
+  "न",
+  "ऩ",
+  "प",
+  "फ",
+  "ब",
+  "भ",
+  "म",
+  "य",
+  "र",
+  "ऱ",
+  "ल",
+  "ऴ",
+  "व",
+  "श",
+  "ष",
+  "स",
+  "ह",
+  "़",
+  "ऽ",
+  "ा",
+  "ि",
+  "ी",
+  "ु",
+  "ू",
+  "ृ",
+  "ॄ",
+  "ॅ",
+  "े",
+  "ै",
+  "ॉ",
+  "ॊ",
+  "ो",
+  "ौ",
+  "्",
+  "ॐ",
+  "॑",
+  "॒",
+  "॓",
+  "ॠ",
+  "ॢ",
+  "।",
+  "॥",
+  "०",
+  "१",
+  "२",
+  "३",
+  "४",
+  "५",
+  "६",
+  "७",
+  "८",
+  "९",
+  "॰",
+  "ॱ",
+  "ॲ",
+  "ॻ",
+  "ॼ",
+  "ॽ",
+  "ॾ"
+]