# NOTE(review): the three lines below were scraped web-page residue
# ("magboola's picture" / commit banner) that would be a syntax error.
# Preserved as a comment:
# commit 69554dd (verified) — fix: hardcode cuda in GPU functions for ZeroGPU compatibility
"""
InternVideo2.5 ZeroGPU HuggingFace Space
Provides action recognition API for EagleEye video analysis.
Uses InternVideo2.5-8B for SOTA open-vocabulary action detection (92.1% on Kinetics-400).
"""
from __future__ import annotations
import base64
import gc
import io
import json
import os
import traceback
from typing import Any
import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
# Prefer the real `spaces` package (present on HuggingFace ZeroGPU hardware).
# Anywhere else, install a no-op stand-in so the `@spaces.GPU(...)` decorators
# used below keep working unchanged.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class _spaces_stub:
        """Minimal stand-in mirroring the `spaces.GPU` decorator factory."""

        @staticmethod
        def GPU(duration=60):
            # Pass-through decorator: hands the wrapped function back unchanged.
            return lambda func: func

    spaces = _spaces_stub()
# HuggingFace Hub id of the primary action-recognition model.
MODEL_NAME = "OpenGVLab/InternVideo2_5_Chat_8B"

# Lazily-initialized globals, populated by load_model() on first use.
# `tokenizer` holds an AutoTokenizer for InternVideo, or a
# VideoMAEImageProcessor once the fallback has been loaded.
model = None
tokenizer = None
# Flipped to True by _load_fallback() when VideoMAE replaces InternVideo.
_is_fallback = False

# Default open-vocabulary action labels offered to the model when the caller
# supplies no candidates. Contact/foul-related actions are listed first;
# note only the first 20 entries are placed into the prompt (see _classify_gpu).
DEFAULT_ACTION_CANDIDATES = [
    "tackling", "wrestling", "headbutting", "punching", "kicking",
    "pushing", "slapping", "sword fighting", "shooting goal (soccer)",
    "dribbling basketball", "passing soccer ball", "catching or throwing baseball",
    "shooting basketball", "dunking basketball", "heading ball",
    "volleyball spiking", "playing tennis", "playing badminton",
    "running", "jogging", "walking", "jumping", "diving", "swimming",
    "climbing", "dancing", "clapping", "waving hand", "shaking hands",
    "hugging", "kissing", "laughing", "crying", "talking", "singing",
    "playing musical instrument", "cooking", "eating", "drinking",
    "writing", "typing", "using phone", "using computer",
    "driving car", "riding bicycle", "riding motorcycle", "riding horse",
    "standing still", "sitting", "celebrating", "arguing", "fighting",
    "falling down", "getting up", "stretching", "exercising", "unknown action",
]

# Substrings that mark a predicted action as potentially foul-related.
FOUL_KEYWORDS = {"tackling", "wrestling", "headbutting", "punching", "kicking", "pushing", "slapping", "fighting"}
def load_model():
    """Lazily load InternVideo2.5 exactly once; fall back to VideoMAE on failure.

    Returns:
        (model, tokenizer) module-level globals after loading.
    """
    global model, tokenizer, _is_fallback
    # Already loaded (either primary or fallback): nothing to do.
    if model is not None:
        return model, tokenizer
    try:
        from transformers import AutoModel, AutoTokenizer

        print(f"Loading {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        model.eval()
        print("InternVideo2.5 loaded")
    except Exception as e:
        # Any load failure (download, remote code, OOM) downgrades to VideoMAE.
        print(f"Failed to load InternVideo2.5: {e}. Falling back to VideoMAE.")
        _load_fallback()
    return model, tokenizer
def _load_fallback():
    """Swap in the VideoMAE Kinetics-400 classifier when InternVideo2.5 is unavailable."""
    global model, tokenizer, _is_fallback
    from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

    fallback = "MCG-NJU/videomae-base-finetuned-kinetics"
    print(f"Loading fallback: {fallback}")
    # In fallback mode the `tokenizer` global holds the image processor.
    tokenizer, model = (
        VideoMAEImageProcessor.from_pretrained(fallback),
        VideoMAEForVideoClassification.from_pretrained(fallback),
    )
    model.eval()
    _is_fallback = True
def decode_base64_image(b64_str: str) -> np.ndarray:
    """Decode a base64-encoded image into an RGB numpy array of shape (H, W, 3)."""
    raw = base64.b64decode(b64_str)
    with Image.open(io.BytesIO(raw)) as img:
        return np.array(img.convert("RGB"))
@spaces.GPU(duration=180)
def _classify_gpu(frames: np.ndarray, action_candidates: list[str] | None = None) -> dict[str, Any]:
    """GPU-only: run action classification on frames.

    Args:
        frames: stacked RGB frames — assumed shape (T, H, W, 3); TODO confirm at callers.
        action_candidates: optional open-vocabulary labels; defaults to
            DEFAULT_ACTION_CANDIDATES (only the first 20 enter the prompt).

    Returns:
        dict with keys: action, confidence, is_foul_related,
        top_5_predictions, model (plus raw_response on the InternVideo path).
    """
    try:
        load_model()
        # CUDA is hardcoded: inside a @spaces.GPU call ZeroGPU guarantees a GPU.
        model.to("cuda")
        if _is_fallback:
            return _classify_fallback_gpu(frames)
        candidates = action_candidates or DEFAULT_ACTION_CANDIDATES
        # Subsample to at most 8 evenly spaced frames for the chat model.
        if len(frames) > 8:
            indices = np.linspace(0, len(frames) - 1, 8, dtype=int)
            frames = frames[indices]
        pil_frames = [Image.fromarray(f) for f in frames]
        action_list = ", ".join(candidates[:20])
        prompt = f"What action is being performed in this video? Choose from: {action_list}. Respond with just the action name."
        try:
            with torch.no_grad():
                response = model.chat(
                    tokenizer=tokenizer,
                    pixel_values=pil_frames,
                    question=prompt,
                    generation_config={"max_new_tokens": 50, "do_sample": False},
                )
            # Map the model's free-text reply back onto the first matching candidate.
            response_lower = response.lower()
            best_match = "unknown action"
            best_score = 0.0
            for candidate in candidates:
                if candidate.lower() in response_lower:
                    best_match = candidate
                    best_score = 0.9  # heuristic confidence; chat model returns no scores
                    break
            is_foul = any(kw in best_match.lower() for kw in FOUL_KEYWORDS)
            return {
                "action": best_match,
                "confidence": best_score,
                "is_foul_related": is_foul,
                "top_5_predictions": [{"action": best_match, "confidence": best_score}],
                "model": MODEL_NAME,
                "raw_response": response,
            }
        except Exception as e:
            # Inference failure on the primary model: retry with VideoMAE.
            print(f"InternVideo2.5 inference failed: {e}")
            return _classify_fallback_gpu(frames)
    finally:
        # Release GPU memory between ZeroGPU invocations.
        # Fix: guard against `model is None` (load_model itself failed) —
        # previously an unconditional model.cpu() raised AttributeError here
        # and masked the original exception.
        if model is not None:
            model.cpu()
        gc.collect()
        torch.cuda.empty_cache()
def _classify_fallback_gpu(frames: np.ndarray) -> dict[str, Any]:
    """Fallback classification using VideoMAE (called while already on GPU)."""
    # VideoMAE expects exactly 16 frames; resample evenly when needed.
    if len(frames) != 16:
        picks = np.linspace(0, len(frames) - 1, 16, dtype=int)
        frames = frames[picks]
    batch = tokenizer([frames[i] for i in range(16)], return_tensors="pt")
    batch = {name: tensor.to("cuda") for name, tensor in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    top_prob, top_idx = torch.max(probs, dim=0)
    action = model.config.id2label[int(top_idx)]
    # Collect the top-k (k <= 5) labels with their probabilities.
    top5_probs, top5_ids = torch.topk(probs, k=min(5, len(probs)))
    top5 = []
    for p, i in zip(top5_probs, top5_ids):
        top5.append({"action": model.config.id2label[int(i)], "confidence": float(p)})
    foul_classes = {"tackling", "wrestling", "headbutting", "punching person (boxing)"}
    return {
        "action": action,
        "confidence": float(top_prob),
        "class_id": int(top_idx),
        "is_foul_related": action in foul_classes,
        "top_5_predictions": top5,
        "model": "MCG-NJU/videomae-base-finetuned-kinetics",
    }
def demo_classify_video(video_file, custom_actions: str) -> str:
    """Demo function for video classification (UI).

    Args:
        video_file: a path string or an uploaded-file object (gr.File) with a
            `.name` attribute; None when nothing was uploaded.
        custom_actions: optional comma-separated action labels; may be None
            or empty when the textbox is untouched.

    Returns:
        Markdown-formatted result string, or a human-readable error message.
    """
    try:
        if video_file is None:
            return "Please upload a video file."
        if isinstance(video_file, str):
            video_path = video_file
        elif hasattr(video_file, "name"):
            video_path = video_file.name
        else:
            return f"Error: Unexpected file type: {type(video_file)}"
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Failed to open video file."
        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total < 4:
                return f"Video too short: {total} frames"
            # Sample up to 16 evenly spaced frames, converted to 224x224 RGB.
            indices = np.linspace(0, total - 1, min(16, total), dtype=int)
            frames = []
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    frames.append(cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (224, 224)))
        finally:
            # Fix: release the capture even if extraction raises (was leaked
            # on the exception path before).
            cap.release()
        if len(frames) < 4:
            return f"Could not extract enough frames (got {len(frames)})"
        # Fix: custom_actions may be None (cleared Gradio textbox) — the old
        # unconditional .strip() raised AttributeError. Also drop empty
        # entries produced by stray commas.
        custom = None
        if custom_actions and custom_actions.strip():
            custom = [a.strip() for a in custom_actions.split(",") if a.strip()] or None
        result = _classify_gpu(np.stack(frames), custom)
        output = f"**Predicted Action:** {result['action']}\n"
        output += f"**Confidence:** {result['confidence']:.2%}\n"
        output += f"**Foul-Related:** {'Yes' if result['is_foul_related'] else 'No'}\n\n"
        output += "**Top 5 Predictions:**\n"
        for pred in result["top_5_predictions"]:
            output += f"- {pred['action']}: {pred['confidence']:.2%}\n"
        output += f"\n**Model:** {result['model']}"
        return output
    except Exception as e:
        return f"Error: {str(e)}\n{traceback.format_exc()}"
def api_classify_action(
    frames_base64: str | None = None,
    timestamp_s: float = 0.0,
    action_candidates: str | None = None,
) -> dict[str, Any]:
    """API endpoint for action classification from EagleEye.

    Args:
        frames_base64: JSON array (as a string) of base64-encoded frame images.
        timestamp_s: caller-supplied timestamp, echoed back in the response.
        action_candidates: optional JSON array or comma-separated labels.

    Returns:
        On success: {"success": True, action, confidence, is_foul_related,
        top_5_predictions, timestamp_s, model}. On failure: {"success": False,
        "error": ..., [traceback]}.
    """
    try:
        # Fix: tolerate leading whitespace before the JSON array — previously
        # " [..]" payloads were silently treated as "no frames".
        frames_list = []
        if frames_base64 and frames_base64.lstrip().startswith("["):
            frames_list = json.loads(frames_base64)
        if not frames_list:
            return {"success": False, "error": "No frames provided"}
        frames = np.stack([decode_base64_image(b64) for b64 in frames_list])
        candidates = None
        if action_candidates and action_candidates.strip():
            try:
                raw = action_candidates.strip()
                if raw.startswith("["):
                    candidates = json.loads(raw)
                else:
                    # Fix: drop empty entries from stray commas ("a,,b").
                    candidates = [c.strip() for c in raw.split(",") if c.strip()]
                candidates = candidates or None  # empty list -> use defaults
            except Exception:
                candidates = None  # best-effort: bad candidate spec falls back to defaults
        result = _classify_gpu(frames, candidates)
        return {
            "success": True,
            "action": result["action"],
            "confidence": result["confidence"],
            "is_foul_related": result["is_foul_related"],
            "top_5_predictions": result["top_5_predictions"],
            "timestamp_s": timestamp_s,
            "model": result["model"],
        }
    except Exception as e:
        return {"success": False, "error": str(e), "traceback": traceback.format_exc()}
# --- Gradio app definition -------------------------------------------------
with gr.Blocks(title="InternVideo2.5 for Cadayn") as demo:
    # Landing description shown at the top of the Space.
    gr.Markdown("""
# InternVideo2.5 - Action Recognition
Powered by [InternVideo2.5-8B](https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B) on ZeroGPU.
**SOTA Performance:**
- 92.1% accuracy on Kinetics-400 (+11.2% over VideoMAE)
- Open-vocabulary action detection
- Custom sports-specific actions
**API Endpoints for EagleEye:**
- `POST /call/api_classify_action` - Action classification
""")
    # Hidden components whose sole purpose is to expose api_classify_action
    # as a named gradio_client endpoint ("/api_classify_action").
    with gr.Row(visible=False):
        api_frames_input = gr.Textbox()
        api_timestamp_input = gr.Number()
        api_candidates_input = gr.Textbox()
        api_result_output = gr.JSON()
    api_frames_input.change(
        fn=api_classify_action,
        inputs=[api_frames_input, api_timestamp_input, api_candidates_input],
        outputs=api_result_output,
        api_name="api_classify_action",
    )
    # Interactive demo tab: upload a video, optionally supply custom labels.
    with gr.Tab("Demo"):
        with gr.Row():
            with gr.Column():
                video_input = gr.File(
                    label="Upload Video",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                )
                custom_actions_input = gr.Textbox(
                    label="Custom Actions (optional)",
                    placeholder="running, jumping, dancing, ...",
                    lines=2,
                )
                classify_btn = gr.Button("Classify Action", variant="primary")
            with gr.Column():
                result_output = gr.Markdown(label="Result")
        classify_btn.click(
            fn=demo_classify_video,
            inputs=[video_input, custom_actions_input],
            outputs=result_output,
        )
    # Static API-usage documentation tab.
    with gr.Tab("API"):
        gr.Markdown("""
## API Usage for EagleEye Integration
### Action Classification
```python
from gradio_client import Client
import json
import base64
client = Client("magboola/internvideo2-zerogpu")
frames_b64 = [base64.b64encode(frame_bytes).decode() for frame_bytes in frames]
custom_actions = ["scoring a goal", "making a tackle", "celebrating"]
result = client.predict(
    frames_base64=json.dumps(frames_b64),
    timestamp_s=5.0,
    action_candidates=json.dumps(custom_actions),
    api_name="/api_classify_action"
)
print(result)
# {"success": True, "action": "scoring a goal", "confidence": 0.9, ...}
```
""")

if __name__ == "__main__":
    # HF Spaces entry point: start the Gradio server.
    demo.launch()