Spaces:

Creator-090
/

isl-api

Sleeping

Creator-090 committed 19 days ago

Commit

86c7cf3

1 Parent(s): 1cf4369

fix: CPU-safe inference for HF free tier

- Skip autocast and torch.compile on CPU
- Reduce warmup to 1 round on CPU (was 3, ~60s saved on cold start)
- Return 503 from /health while model is loading so wake_up() retries correctly

Files changed (2) hide show

app.py +13 -10
model.py +37 -60

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Global state
 model        = None
 model_loaded = False
 model_error  = None
@@ -43,7 +43,7 @@ async def startup_event():
         print("Model failed to load:", e)
-# Root
 @app.get("/")
 def root():
     return {
@@ -52,11 +52,15 @@ def root():
     }
-# Health
 @app.get("/health")
 def health():
     if not model_loaded or model is None:
-        return {"status": "error", "model_loaded": False, "error": model_error}
     return {
         "status":       "ok",
         "model_loaded": True,
@@ -65,7 +69,7 @@ def health():
     }
-# Deep health
 @app.get("/health/deep")
 def health_deep():
     if not model_loaded or model is None:
@@ -73,7 +77,6 @@ def health_deep():
     try:
         import torch
-        # Must match the dtype the model now runs in (FP16 on GPU)
         dummy = torch.zeros(1, 3, 16, 224, 224, device=DEVICE, dtype=_DTYPE)
         with torch.no_grad():
             _ = model(dummy)
@@ -82,9 +85,9 @@ def health_deep():
         raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
-# Predict from frames (real-time path)
 class FramesPayload(BaseModel):
-    frames: List[str]   # base64-encoded JPEG/PNG, exactly 16
     top_k:  int = 5
 @app.post("/predict_frames")
@@ -110,7 +113,7 @@ async def predict_frames_api(payload: FramesPayload):
     }
-# Predict from video file
 ALLOWED_EXTENSIONS = ('.mp4', '.mov', '.avi', '.mkv')
 @app.post("/predict")
@@ -138,6 +141,6 @@ async def predict_sign(file: UploadFile = File(...), top_k: int = 5):
     }
-# Entry point
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860)

     allow_headers=["*"],
 )
+#  Global state ─
 model        = None
 model_loaded = False
 model_error  = None
         print("Model failed to load:", e)
+#  Root ─
 @app.get("/")
 def root():
     return {
     }
+#  Health ─
 @app.get("/health")
 def health():
     if not model_loaded or model is None:
+        # Return 503 so the wake_up() retry loop in backend knows to keep waiting
+        raise HTTPException(
+            status_code=503,
+            detail={"status": "error", "model_loaded": False, "error": model_error}
+        )
     return {
         "status":       "ok",
         "model_loaded": True,
     }
+#  Deep health
 @app.get("/health/deep")
 def health_deep():
     if not model_loaded or model is None:
     try:
         import torch
         dummy = torch.zeros(1, 3, 16, 224, 224, device=DEVICE, dtype=_DTYPE)
         with torch.no_grad():
             _ = model(dummy)
         raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
+#  Predict from frames (real-time path) ─
 class FramesPayload(BaseModel):
+    frames: List[str]
     top_k:  int = 5
 @app.post("/predict_frames")
     }
+#  Predict from video file
 ALLOWED_EXTENSIONS = ('.mp4', '.mov', '.avi', '.mkv')
 @app.post("/predict")
     }
+#  Entry point
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860)

model.py CHANGED Viewed

@@ -8,7 +8,7 @@ from decord.bridge import set_bridge
 import cv2
 import numpy as np
-# Classes
 CLASSES = [
     'afternoon', 'animal', 'bad', 'beautiful', 'big', 'bird', 'blind',
     'cat', 'cheap', 'clothing', 'cold', 'cow', 'curved', 'deaf', 'dog',
@@ -22,23 +22,23 @@ CLASSES = [
     'warm', 'wednesday', 'week', 'wet', 'wide', 'year', 'yesterday', 'young'
 ]
-# Constants
 CLIP_LENGTH = 16
 DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-USE_FP16    = DEVICE.type == "cuda"
-# Global transform pipeline (built once, runs on GPU)
-# Replaces VivitImageProcessor - same operations, but GPU-accelerated via torchvision v2
-_DTYPE = torch.float16 if USE_FP16 else torch.float32
 TRANSFORMS = v2.Compose([
-    v2.Resize(224, antialias=True),                         # shortest edge → 224
-    v2.CenterCrop(224),                                     # 224×224
-    v2.ToDtype(_DTYPE, scale=True),                         # uint8 => float, /255
     v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
 ])
-# Model
 class SwinTClassifications(nn.Module):
     def __init__(self, classes, weights="KINETICS400_V1"):
         super().__init__()
@@ -56,10 +56,9 @@ class SwinTClassifications(nn.Module):
 def load_model():
-    """Downloads model from HF Hub, applies FP16 + torch.compile for max speed."""
     from huggingface_hub import hf_hub_download
-    print(f"Loading model on {DEVICE} (fp16={USE_FP16}) ...")
     model_path = hf_hub_download(
         repo_id="Creator-090/isl-swin3d-model",
         filename="ISL_best_model.pt"
@@ -71,27 +70,25 @@ def load_model():
     )
     model = model.to(DEVICE)
-    # FP16 on GPU - ~2x faster inference, no accuracy loss for classification
     if USE_FP16:
         model = model.half()
     model.eval()
-    # torch.compile - fuses ops, reduces Python overhead (~20-35% faster after warmup)
     if DEVICE.type == "cuda":
-        print("Compiling model with torch.compile (mode=reduce-overhead) ...")
         model = torch.compile(model, mode="reduce-overhead")
-    # Warmup - triggers compilation + CUDA kernel caching so first real request is fast
     _warmup(model)
     print("Model ready.")
     return model
-def _warmup(model, rounds: int = 3):
-    """Run a few dummy forward passes to trigger torch.compile and warm CUDA kernels."""
-    print(f"Warming up model ({rounds} rounds) ...")
     dummy = torch.zeros(1, 3, CLIP_LENGTH, 224, 224, device=DEVICE, dtype=_DTYPE)
     with torch.no_grad():
         for _ in range(rounds):
@@ -101,92 +98,74 @@ def _warmup(model, rounds: int = 3):
     print("Warmup complete.")
-# Preprocessing helpers
 def _frames_to_tensor(frames: list) -> torch.Tensor:
-    """
-    Converts a list of numpy (H,W,3) RGB frames → (1, C, T, H, W) tensor on DEVICE.
-    Resize + normalize happen on GPU via torchvision v2 transforms.
-    """
-    # Stack => (T, C, H, W) uint8
     video = torch.stack([
-        torch.from_numpy(f).permute(2, 0, 1)   # H,W,C => C,H,W
         for f in frames
-    ])                                           # (T, C, H, W)
-    video = video.to(DEVICE)                    # move to GPU first, then transform
-    video = TRANSFORMS(video)                   # resize + crop + normalize on GPU => (T, C, H, W)
-    video = video.permute(1, 0, 2, 3)           # (C, T, H, W) =>  Swin3D expects this
-    return video.unsqueeze(0)                   # (1, C, T, H, W)
 def _pad_or_trim(frames: list, clip_length: int) -> list:
     if len(frames) < clip_length:
         frames += [frames[-1]] * (clip_length - len(frames))
     elif len(frames) > clip_length:
-        # Uniform temporal sampling instead of naive truncation
         indices = [int(i * len(frames) / clip_length) for i in range(clip_length)]
         frames  = [frames[i] for i in indices]
     return frames
 def preprocess_video(video_bytes: bytes, clip_length: int = CLIP_LENGTH) -> torch.Tensor:
-    """
-    Decodes a video from raw bytes (no disk I/O) and returns a model-ready tensor.
-    Uses decord's in-memory VideoReader to avoid the tempfile write/read cycle.
-    """
     set_bridge("torch")
-    vr     = VideoReader(io.BytesIO(video_bytes))           # in-memory, no disk write
-    total  = len(vr)
-    idx    = list(range(min(total, clip_length)))
     if len(idx) < clip_length:
         idx += [idx[-1]] * (clip_length - len(idx))
-    batch  = vr.get_batch(idx).asnumpy()                    # (T, H, W, C) uint8 numpy
-    frames = [batch[i] for i in range(batch.shape[0])]      # list of (H, W, C)
     return _frames_to_tensor(frames)
 def preprocess_frames(frames_list_bytes: list[bytes], clip_length: int = CLIP_LENGTH) -> torch.Tensor:
-    """
-    Decodes a list of JPEG/PNG frame bytes and returns a model-ready tensor.
-    All heavy lifting (resize, normalize) happens on GPU.
-    """
     frames = []
     for fb in frames_list_bytes:
         arr = np.frombuffer(fb, np.uint8)
         img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
         if img is None:
             continue
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)          # BGR → RGB
         frames.append(img)
     if not frames:
-        raise ValueError("No valid frames could be decoded from the provided bytes.")
     frames = _pad_or_trim(frames, clip_length)
     return _frames_to_tensor(frames)
 # Inference
 def _run_inference(model, pixel_values: torch.Tensor, top_k: int) -> dict:
-    """Shared inference logic for both predict paths."""
     with torch.no_grad():
-        # autocast is a no-op on CPU; on GPU it enforces FP16 even if something slipped through
-        with torch.autocast(device_type=DEVICE.type, dtype=_DTYPE, enabled=USE_FP16):
             outputs = model(pixel_values)
         probs = torch.nn.functional.softmax(outputs, dim=-1)[0]
     top_probs, top_indices = torch.topk(probs, k=top_k)
     results = [
         {"class": CLASSES[top_indices[i].item()], "confidence": float(top_probs[i].item())}
         for i in range(top_k)
     ]
     return {
         "prediction": results[0]["class"],
         "confidence": results[0]["confidence"],
@@ -195,12 +174,10 @@ def _run_inference(model, pixel_values: torch.Tensor, top_k: int) -> dict:
 def predict(model, video_bytes: bytes, top_k: int = 5) -> dict:
-    """Inference from raw video bytes."""
     pixel_values = preprocess_video(video_bytes)
     return _run_inference(model, pixel_values, top_k)
 def predict_from_frames(model, frames_list_bytes: list[bytes], top_k: int = 5) -> dict:
-    """Inference from a list of raw JPEG/PNG frame bytes."""
     pixel_values = preprocess_frames(frames_list_bytes)
     return _run_inference(model, pixel_values, top_k)

 import cv2
 import numpy as np
+#  Classes
 CLASSES = [
     'afternoon', 'animal', 'bad', 'beautiful', 'big', 'bird', 'blind',
     'cat', 'cheap', 'clothing', 'cold', 'cow', 'curved', 'deaf', 'dog',
     'warm', 'wednesday', 'week', 'wet', 'wide', 'year', 'yesterday', 'young'
 ]
+#  Constants
 CLIP_LENGTH = 16
 DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+USE_FP16    = DEVICE.type == "cuda"   # False on HF free tier (CPU only)
+_DTYPE      = torch.float16 if USE_FP16 else torch.float32
+print(f"[model] device={DEVICE} | fp16={USE_FP16} | dtype={_DTYPE}")
+# Global transform pipeline (built once)
 TRANSFORMS = v2.Compose([
+    v2.Resize(224, antialias=True),
+    v2.CenterCrop(224),
+    v2.ToDtype(_DTYPE, scale=True),
     v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
 ])
+# Model
 class SwinTClassifications(nn.Module):
     def __init__(self, classes, weights="KINETICS400_V1"):
         super().__init__()
 def load_model():
     from huggingface_hub import hf_hub_download
+    print(f"Loading model on {DEVICE} ...")
     model_path = hf_hub_download(
         repo_id="Creator-090/isl-swin3d-model",
         filename="ISL_best_model.pt"
     )
     model = model.to(DEVICE)
     if USE_FP16:
         model = model.half()
     model.eval()
+    # torch.compile only on CUDA — can error or be very slow on CPU
     if DEVICE.type == "cuda":
+        print("Compiling model with torch.compile ...")
         model = torch.compile(model, mode="reduce-overhead")
     _warmup(model)
     print("Model ready.")
     return model
+def _warmup(model):
+    # 1 round on CPU (warmup is slow ~30s on CPU Swin3D), 3 on GPU
+    rounds = 1 if DEVICE.type == "cpu" else 3
+    print(f"Warming up ({rounds} round(s) on {DEVICE}) ...")
     dummy = torch.zeros(1, 3, CLIP_LENGTH, 224, 224, device=DEVICE, dtype=_DTYPE)
     with torch.no_grad():
         for _ in range(rounds):
     print("Warmup complete.")
+# Preprocessing
 def _frames_to_tensor(frames: list) -> torch.Tensor:
     video = torch.stack([
+        torch.from_numpy(f).permute(2, 0, 1)
         for f in frames
+    ])                                   # (T, C, H, W) uint8
+    video = video.to(DEVICE)
+    video = TRANSFORMS(video)           # (T, C, H, W) float
+    video = video.permute(1, 0, 2, 3)  # (C, T, H, W)
+    return video.unsqueeze(0)           # (1, C, T, H, W)
 def _pad_or_trim(frames: list, clip_length: int) -> list:
     if len(frames) < clip_length:
         frames += [frames[-1]] * (clip_length - len(frames))
     elif len(frames) > clip_length:
         indices = [int(i * len(frames) / clip_length) for i in range(clip_length)]
         frames  = [frames[i] for i in indices]
     return frames
 def preprocess_video(video_bytes: bytes, clip_length: int = CLIP_LENGTH) -> torch.Tensor:
     set_bridge("torch")
+    vr    = VideoReader(io.BytesIO(video_bytes))
+    total = len(vr)
+    idx   = list(range(min(total, clip_length)))
     if len(idx) < clip_length:
         idx += [idx[-1]] * (clip_length - len(idx))
+    batch  = vr.get_batch(idx).asnumpy()
+    frames = [batch[i] for i in range(batch.shape[0])]
     return _frames_to_tensor(frames)
 def preprocess_frames(frames_list_bytes: list[bytes], clip_length: int = CLIP_LENGTH) -> torch.Tensor:
     frames = []
     for fb in frames_list_bytes:
         arr = np.frombuffer(fb, np.uint8)
         img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
         if img is None:
             continue
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         frames.append(img)
     if not frames:
+        raise ValueError("No valid frames could be decoded.")
     frames = _pad_or_trim(frames, clip_length)
     return _frames_to_tensor(frames)
 # Inference
 def _run_inference(model, pixel_values: torch.Tensor, top_k: int) -> dict:
     with torch.no_grad():
+        if USE_FP16:
+            # autocast only valid on CUDA
+            with torch.autocast(device_type="cuda", dtype=torch.float16):
+                outputs = model(pixel_values)
+        else:
+            # CPU path — plain fp32, no autocast
             outputs = model(pixel_values)
         probs = torch.nn.functional.softmax(outputs, dim=-1)[0]
     top_probs, top_indices = torch.topk(probs, k=top_k)
     results = [
         {"class": CLASSES[top_indices[i].item()], "confidence": float(top_probs[i].item())}
         for i in range(top_k)
     ]
     return {
         "prediction": results[0]["class"],
         "confidence": results[0]["confidence"],
 def predict(model, video_bytes: bytes, top_k: int = 5) -> dict:
     pixel_values = preprocess_video(video_bytes)
     return _run_inference(model, pixel_values, top_k)
 def predict_from_frames(model, frames_list_bytes: list[bytes], top_k: int = 5) -> dict:
     pixel_values = preprocess_frames(frames_list_bytes)
     return _run_inference(model, pixel_values, top_k)