Spaces:

Creator-090
/

isl-api

Sleeping

App Files Files Community

Creator-090 committed 26 days ago

Commit

de9af52

2 Parent(s): 053568e d91411e

feat: add /predict_frames endpoint and frame-based preprocessing to FastAPI service

Browse files

Files changed (2) hide show

app.py +43 -0
model.py +73 -0

app.py CHANGED Viewed

@@ -3,10 +3,14 @@ from fastapi import FastAPI, File, UploadFile, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
 from model import load_model, predict, predict_from_frames
 import time
 from pydantic import BaseModel
 from typing import List
 import base64
 app = FastAPI(
     title="ISL Recognition API",
@@ -14,20 +18,28 @@ app = FastAPI(
     version="1.0.0"
 )
 # Allow all origins (for Flutter / frontend apps)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_methods=["*"],
     allow_headers=["*"],
 )
 # Global state
 model = None
 model_loaded = False
 model_error = None
 #  STARTUP
 @app.on_event("startup")
 async def startup_event():
@@ -42,7 +54,19 @@ async def startup_event():
         model_error = str(e)
         print("Model failed to load:", e)
 #  ROOT
 @app.get("/")
 def root():
@@ -131,6 +155,7 @@ async def predict_frames_api(payload: FramesPayload):
 #  PREDICT
 @app.post("/predict")
 async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
     # Validate file type
     if not file.filename.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
         raise HTTPException(
@@ -139,6 +164,12 @@ async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
         )
     # Ensure model is ready
     if not model_loaded or model is None:
         raise HTTPException(
             status_code=503,
@@ -148,6 +179,15 @@ async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
     start_time = time.time()
     video_bytes = await file.read()
     try:
         result = predict(model, video_bytes, top_k=top_k)
     except Exception as e:
@@ -159,8 +199,11 @@ async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
     result["inference_time_ms"] = round((time.time() - start_time) * 1000, 2)
     result["filename"] = file.filename
     return result
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860)

 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
 from model import load_model, predict, predict_from_frames
+from model import load_model, predict, predict_from_frames
 import time
 from pydantic import BaseModel
 from typing import List
 import base64
+from pydantic import BaseModel
+from typing import List
+import base64
 app = FastAPI(
     title="ISL Recognition API",
     version="1.0.0"
 )
+# Allow all origins (for Flutter / frontend apps)
 # Allow all origins (for Flutter / frontend apps)
 app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
     CORSMiddleware,
     allow_origins=["*"],
     allow_methods=["*"],
     allow_headers=["*"],
 )
+# Global state
 # Global state
 model = None
 model_loaded = False
 model_error = None
+model_loaded = False
+model_error = None
+#  STARTUP
 #  STARTUP
 @app.on_event("startup")
 async def startup_event():
         model_error = str(e)
         print("Model failed to load:", e)
+    global model, model_loaded, model_error
+    try:
+        model = load_model()
+        model_loaded = True
+        model_error = None
+        print("Model loaded and API is ready!")
+    except Exception as e:
+        model_loaded = False
+        model_error = str(e)
+        print("Model failed to load:", e)
+#  ROOT
 #  ROOT
 @app.get("/")
 def root():
 #  PREDICT
 @app.post("/predict")
 async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
+    # Validate file type
     # Validate file type
     if not file.filename.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
         raise HTTPException(
         )
     # Ensure model is ready
+    if not model_loaded or model is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Model is not ready"
+        )
+    # Ensure model is ready
     if not model_loaded or model is None:
         raise HTTPException(
             status_code=503,
     start_time = time.time()
     video_bytes = await file.read()
+    try:
+        result = predict(model, video_bytes, top_k=top_k)
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Inference error: {str(e)}"
+        )
     try:
         result = predict(model, video_bytes, top_k=top_k)
     except Exception as e:
     result["inference_time_ms"] = round((time.time() - start_time) * 1000, 2)
     result["filename"] = file.filename
     return result
 if __name__ == "__main__":
+    uvicorn.run("app:app", host="0.0.0.0", port=7860)
     uvicorn.run("app:app", host="0.0.0.0", port=7860)

model.py CHANGED Viewed

@@ -10,6 +10,8 @@ import tempfile
 import os
 import cv2
 import numpy as np
 # Exactly 76 classes from your notebook metadata
 CLASSES = [
@@ -181,6 +183,77 @@ def predict_from_frames(model, frames_list_bytes: list[bytes], top_k: int = 5):
         "top_k": results
     }
 def predict(model, video_bytes: bytes, top_k: int = 5):
     """Runs inference and returns the top results"""
     pixel_values = preprocess_video(video_bytes).to(DEVICE)

 import os
 import cv2
 import numpy as np
+import cv2
+import numpy as np
 # Exactly 76 classes from your notebook metadata
 CLASSES = [
         "top_k": results
     }
+def preprocess_frames(frames_list_bytes: list[bytes], clip_length: int = 16):
+    """
+    Processes a list of raw frame bytes (JPEG/PNG encoded) into the Swin3D model input format.
+    Eliminates video encoding/decoding and disk I/O.
+    """
+    image_processor = VivitImageProcessor(
+        do_resize=True,
+        size={"shortest_edge": 224},
+        do_center_crop=True,
+        crop_size={"height": 224, "width": 224},
+        do_rescale=True,
+        rescale_factor=1/255,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+    )
+    frames = []
+    for frame_bytes in frames_list_bytes:
+        # Decode image from bytes
+        nparr = np.frombuffer(frame_bytes, np.uint8)
+        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+        if img is not None:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            frames.append(img)
+    if not frames:
+        raise ValueError("No valid frames decoded")
+    # Temporal sampling/padding
+    if len(frames) < clip_length:
+        frames += [frames[-1]] * (clip_length - len(frames))
+    elif len(frames) > clip_length:
+        frames = frames[:clip_length]
+    # Processor expects list of numpy arrays (H, W, C)
+    processed = image_processor(
+        frames,
+        return_tensors='pt',
+        # image_processor handles (T, C, H, W) return with return_tensors='pt'
+        # but we need to check internal dimension order
+    )
+    pixel_values = processed['pixel_values'].squeeze(0) # (T, C, H, W)
+    pixel_values = pixel_values.permute(1, 0, 2, 3)    # (C, T, H, W) for Swin3D
+    return pixel_values.unsqueeze(0)
+def predict_from_frames(model, frames_list_bytes: list[bytes], top_k: int = 5):
+    """Runs inference from raw frame bytes"""
+    pixel_values = preprocess_frames(frames_list_bytes).to(DEVICE)
+    with torch.no_grad():
+        outputs = model(pixel_values)
+        probabilities = torch.nn.functional.softmax(outputs, dim=-1)[0]
+    top_probs, top_indices = torch.topk(probabilities, k=top_k)
+    results = []
+    for i in range(top_k):
+        results.append({
+            "class": CLASSES[top_indices[i].item()],
+            "confidence": float(top_probs[i].item())
+        })
+    return {
+        "prediction": results[0]["class"],
+        "confidence": results[0]["confidence"],
+        "top_k": results
+    }
 def predict(model, video_bytes: bytes, top_k: int = 5):
     """Runs inference and returns the top results"""
     pixel_values = preprocess_video(video_bytes).to(DEVICE)