"""
FastAPI application for Sign Language Recognition API
"""
import os
import torch
import numpy as np
import joblib
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict
import mediapipe as mp
from datetime import datetime
from model import CustomLSTM
from preprocessing import decode_base64_image, process_frame
# Release metadata (returned verbatim by the health endpoints)
VERSION: str = "2.0.0"
MODEL_VERSION: str = "CV_Test-2026-01-05"  # Updated with 'hi' gesture support
LAST_UPDATED: str = "2026-01-05T10:00:00Z"
# Initialize FastAPI app
app = FastAPI(
    title="Sign Language Recognition API",
    description="Real-time Malaysian Sign Language (MSL) recognition using MediaPipe and LSTM",
    version=VERSION,
)

# Enable CORS for web app access.
# The allowed origins default to "*" (the previous hard-coded value) but can
# be narrowed without a code change by setting CORS_ALLOW_ORIGINS to a
# comma-separated list, e.g. "https://a.example,https://b.example".
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]  # an empty/blank variable falls back to the permissive default

# NOTE(review): FastAPI's CORS documentation says wildcard origins/methods/
# headers should not be combined with allow_credentials=True. The flag is
# kept as-is for backward compatibility; set explicit origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Gesture classes. Order matters: a label's position in this list is the
# class index used to look up the predicted gesture.
GESTURES = [
    'minum',
    'berjalan',
    'berlari',
    'bola',
    'dari',
    'hi',
    'jangan',
    'mohon',
    'pen',
    'teh tarik',
    'tolong',
]

# Configuration
INPUT_SIZE = 258            # first constructor argument of the LSTM
HIDDEN_SIZE = 64            # second constructor argument of the LSTM
SEQUENCE_LENGTH = 30        # frames buffered per session before predicting
CONFIDENCE_THRESHOLD = 0.7  # minimum top probability for a "Predicted" message
NUM_CLASSES = len(GESTURES)
# Device configuration: use CUDA when torch reports it as available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Artifact locations (relative paths, so they resolve against the working
# directory the server is started from)
model_path = "trained_model.pth"
scaler_path = "scaler.bin"

# Build the network on the selected device, then restore its weights.
# NOTE(review): torch.load and joblib.load both unpickle their input; only
# load artifacts from a trusted source.
model = CustomLSTM(INPUT_SIZE, HIDDEN_SIZE, NUM_CLASSES).to(device)
try:
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # switch to inference mode
    print(f"Model loaded successfully from {model_path}")
except Exception as load_error:
    # Log for the operator, then abort the import: the API cannot serve
    # predictions without the model.
    print(f"Error loading model: {load_error}")
    raise

# Load scaler
try:
    scaler = joblib.load(scaler_path)
    print(f"Scaler loaded successfully from {scaler_path}")
except Exception as load_error:
    # Same policy as for the model: report and fail start-up.
    print(f"Error loading scaler: {load_error}")
    raise
# Per-session frame buffers (in production, use Redis or similar).
# Key: session_id, Value: list of keypoints.
# NOTE(review): nothing in this file evicts old sessions, so the dict grows
# with every new session_id.
sequences = {}

# Initialize MediaPipe: one Holistic instance, created at import time and
# passed to process_frame for every request.
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(
    min_tracking_confidence=0.5,
    min_detection_confidence=0.5,
)
# Request/Response models
class FrameRequest(BaseModel):
    # Request body for POST /predict.
    # (Comments are used instead of a class docstring so the generated
    # schema description is left unchanged.)

    # Base64 encoded image
    frame: str
    # Selects which per-session frame buffer the frame is added to
    session_id: str = "default"
class PredictionResponse(BaseModel):
    # Response body of POST /predict.

    # Predicted label, or "collecting_frames" while the buffer is filling
    gesture: str
    # Probability of the predicted label (0.0 while collecting)
    confidence: float
    # Probability per gesture label (empty while collecting)
    all_predictions: Dict[str, float]
    # Number of frames currently buffered for the session
    sequence_length: int
    # Human-readable status text
    message: str
class HealthResponse(BaseModel):
    # Response body of the "/" and "/health" endpoints.

    status: str           # service status string
    device: str           # torch device in use, as a string
    model_loaded: bool    # whether the model is loaded
    gestures: List[str]   # supported gesture labels
    version: str          # API version
    model_version: str    # identifier of the deployed model
    last_updated: str     # timestamp string of the last update
    num_classes: int      # number of gesture classes
def _health_payload():
    """Build the status document served by both health endpoints.

    Kept in one place so "/" and "/health" cannot drift apart.

    Returns:
        Dict with the fields declared by ``HealthResponse``.
    """
    return {
        "status": "healthy",
        "device": str(device),
        # The module raises at import time if loading fails, so a running
        # server always has the model available.
        "model_loaded": True,
        "gestures": GESTURES,
        "version": VERSION,
        "model_version": MODEL_VERSION,
        "last_updated": LAST_UPDATED,
        "num_classes": NUM_CLASSES,
    }


@app.get("/", response_model=HealthResponse)
async def root():
    """Health check endpoint"""
    return _health_payload()


@app.get("/health", response_model=HealthResponse)
async def health():
    """Detailed health check"""
    return _health_payload()
def _classify_sequence(frames):
    """Scale a window of keypoint frames and run it through the model.

    Args:
        frames: List of per-frame keypoint vectors (the buffered window).

    Returns:
        Tuple ``(best_index, scores)``: ``scores`` is the list of softmax
        probabilities produced for the window and ``best_index`` is the
        position of the largest one.
    """
    scaled = scaler.transform(np.array(frames))
    # The model is called with a leading batch axis of size 1.
    batch = torch.tensor(
        np.expand_dims(scaled, axis=0),
        dtype=torch.float32,
    ).to(device)

    with torch.no_grad():
        output = model(batch)
        probabilities = torch.softmax(output, dim=1)[0]

    # Convert once to plain Python floats instead of calling .item() per class.
    scores = probabilities.tolist()
    best_index = int(torch.argmax(probabilities).item())
    return best_index, scores


@app.post("/predict", response_model=PredictionResponse)
async def predict(request: FrameRequest):
    """
    Process a single frame and return a prediction.

    The API keeps a buffer of keypoints per ``session_id``. A frame is added
    to the buffer only when keypoints were extracted from it, and the buffer
    is capped at the most recent ``SEQUENCE_LENGTH`` entries. Until the
    buffer is full the response reports ``"collecting_frames"``; afterwards
    every call classifies the current window (so a frame without keypoints
    re-classifies the existing window).

    Args:
        request: FrameRequest containing the base64 encoded frame and the
            session_id.

    Returns:
        Dict matching PredictionResponse: the top gesture, its probability,
        the probability of every gesture, the buffer length and a message.
        The top gesture is returned even when its probability is below
        ``CONFIDENCE_THRESHOLD``; only the message differs.

    Raises:
        HTTPException: status 500 with the original error text if any step
            fails.
    """
    try:
        # Decode the frame and extract keypoints
        frame = decode_base64_image(request.frame)
        keypoints = process_frame(frame, holistic)

        # Fetch the session buffer, creating it on first use
        buffer = sequences.setdefault(request.session_id, [])

        # Add keypoints to the sequence (only if something was detected)
        if keypoints is not None:
            buffer.append(keypoints)
            # Keep only the most recent SEQUENCE_LENGTH frames (in place)
            del buffer[:-SEQUENCE_LENGTH]

        current_length = len(buffer)

        # Not enough frames yet: report progress instead of a prediction
        if current_length < SEQUENCE_LENGTH:
            return {
                "gesture": "collecting_frames",
                "confidence": 0.0,
                "all_predictions": {},
                "sequence_length": current_length,
                "message": f"Collecting frames... {current_length}/{SEQUENCE_LENGTH}",
            }

        best_index, scores = _classify_sequence(buffer[-SEQUENCE_LENGTH:])
        predicted_gesture = GESTURES[best_index]
        max_prob = scores[best_index]

        if max_prob >= CONFIDENCE_THRESHOLD:
            message = f"Predicted: {predicted_gesture}"
        else:
            message = "Low confidence - keep signing"

        return {
            "gesture": predicted_gesture,
            "confidence": max_prob,
            "all_predictions": dict(zip(GESTURES, scores)),
            "sequence_length": current_length,
            "message": message,
        }
    except HTTPException:
        # Let deliberate HTTP errors keep their own status code
        raise
    except Exception as exc:
        # Chain the cause so the server-side traceback shows the real failure
        raise HTTPException(
            status_code=500,
            detail=f"Prediction error: {exc}",
        ) from exc
@app.post("/reset")
async def reset_session(session_id: str = "default"):
    """
    Reset the sequence buffer for a session.

    Resetting an unknown session is not an error; the same success message
    is returned.

    Args:
        session_id: Session identifier (defaults to "default")

    Returns:
        Dict with a single "message" key confirming the reset
    """
    # Remove the entry instead of leaving an empty list behind, so reset
    # sessions do not accumulate in the store; /predict recreates the
    # buffer on demand for ids it has not seen.
    sequences.pop(session_id, None)
    return {"message": f"Session {session_id} reset successfully"}
@app.get("/gestures")
async def get_gestures():
    """
    List every gesture the service can recognise.

    Returns:
        Dict with "gestures" (the label list) and "count" (its length)
    """
    supported = GESTURES
    return {"gestures": supported, "count": len(supported)}
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7860. Imported here so uvicorn is only needed when the file is
    # executed directly.
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")
|