Spaces:
Running on Zero
| """ | |
| InternVideo2.5 ZeroGPU HuggingFace Space | |
| Provides action recognition API for EagleEye video analysis. | |
| Uses InternVideo2.5-8B for SOTA open-vocabulary action detection (92.1% on Kinetics-400). | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import gc | |
| import io | |
| import json | |
| import os | |
| import traceback | |
| from typing import Any | |
| import cv2 | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class _spaces_stub:
        """Minimal stand-in for the HF `spaces` module outside ZeroGPU.

        Only `GPU()` is provided: it returns a pass-through decorator so
        `spaces.GPU(duration=...)` works unchanged in local runs.
        """

        # Must be a @staticmethod: as a plain instance method, `self` binds
        # to `duration`, and spaces.GPU(duration=...) raises
        # "got multiple values for argument 'duration'".
        @staticmethod
        def GPU(duration=60):
            """Return a no-op decorator (no GPU scheduling locally)."""
            def decorator(func):
                return func
            return decorator

    spaces = _spaces_stub()
# Hugging Face Hub id of the primary open-vocabulary video-chat model.
MODEL_NAME = "OpenGVLab/InternVideo2_5_Chat_8B"

# Lazily-initialized globals, populated by load_model() on first call.
# `tokenizer` holds an AutoTokenizer for InternVideo2.5, or a
# VideoMAEImageProcessor once the fallback model is active.
model = None
tokenizer = None
# Flipped to True by _load_fallback() when VideoMAE replaces InternVideo2.5.
_is_fallback = False

# Default open-vocabulary action labels offered to the model when the caller
# supplies no candidates (contact/foul actions first, generic actions last).
DEFAULT_ACTION_CANDIDATES = [
    "tackling", "wrestling", "headbutting", "punching", "kicking",
    "pushing", "slapping", "sword fighting", "shooting goal (soccer)",
    "dribbling basketball", "passing soccer ball", "catching or throwing baseball",
    "shooting basketball", "dunking basketball", "heading ball",
    "volleyball spiking", "playing tennis", "playing badminton",
    "running", "jogging", "walking", "jumping", "diving", "swimming",
    "climbing", "dancing", "clapping", "waving hand", "shaking hands",
    "hugging", "kissing", "laughing", "crying", "talking", "singing",
    "playing musical instrument", "cooking", "eating", "drinking",
    "writing", "typing", "using phone", "using computer",
    "driving car", "riding bicycle", "riding motorcycle", "riding horse",
    "standing still", "sitting", "celebrating", "arguing", "fighting",
    "falling down", "getting up", "stretching", "exercising", "unknown action",
]

# Substring keywords that mark a predicted action as potentially foul-related
# (matched against the best-match label in _classify_gpu).
FOUL_KEYWORDS = {"tackling", "wrestling", "headbutting", "punching", "kicking", "pushing", "slapping", "fighting"}
def load_model():
    """Lazily load InternVideo2.5, falling back to VideoMAE on failure.

    Populates the module globals `model` and `tokenizer` exactly once;
    subsequent calls return the already-loaded pair.

    Returns:
        Tuple of (model, tokenizer) — the active globals.
    """
    global model, tokenizer, _is_fallback
    # Guard clause: everything is already loaded, nothing to do.
    if model is not None:
        return model, tokenizer
    try:
        from transformers import AutoModel, AutoTokenizer
        print(f"Loading {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        model.eval()
        print("InternVideo2.5 loaded")
    except Exception as exc:
        # Any load failure (download, weights, remote code) triggers the
        # VideoMAE fallback so the Space keeps serving predictions.
        print(f"Failed to load InternVideo2.5: {exc}. Falling back to VideoMAE.")
        _load_fallback()
    return model, tokenizer
def _load_fallback():
    """Replace the global model/tokenizer with the VideoMAE Kinetics classifier."""
    global model, tokenizer, _is_fallback
    from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

    checkpoint = "MCG-NJU/videomae-base-finetuned-kinetics"
    print(f"Loading fallback: {checkpoint}")
    # VideoMAE uses an image processor rather than a text tokenizer; it is
    # stored in the same `tokenizer` global so call sites stay uniform.
    tokenizer = VideoMAEImageProcessor.from_pretrained(checkpoint)
    # .eval() returns the module itself, so loading and eval-mode chain.
    model = VideoMAEForVideoClassification.from_pretrained(checkpoint).eval()
    _is_fallback = True
def decode_base64_image(b64_str: str) -> np.ndarray:
    """Decode a base64-encoded image string into an RGB numpy array (H, W, 3)."""
    raw_bytes = base64.b64decode(b64_str)
    image = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
    return np.array(image)
def _classify_gpu(frames: np.ndarray, action_candidates: list[str] | None = None) -> dict[str, Any]:
    """GPU-only: run action classification on frames.

    Args:
        frames: stacked RGB frames, presumably (T, H, W, 3) uint8 — produced
            by decode_base64_image / demo frame extraction.
        action_candidates: optional open-vocabulary labels; falls back to
            DEFAULT_ACTION_CANDIDATES when None/empty.

    Returns:
        Dict with keys: action, confidence, is_foul_related,
        top_5_predictions, model (and raw_response on the primary path).
    """
    try:
        load_model()
        model.to("cuda")
        if _is_fallback:
            return _classify_fallback_gpu(frames)
        candidates = action_candidates or DEFAULT_ACTION_CANDIDATES
        # Subsample to at most 8 frames for the chat model's frame budget.
        if len(frames) > 8:
            indices = np.linspace(0, len(frames) - 1, 8, dtype=int)
            frames = frames[indices]
        pil_frames = [Image.fromarray(f) for f in frames]
        # Keep the prompt short: only the first 20 candidates are listed,
        # but ALL candidates are matched against the response below.
        action_list = ", ".join(candidates[:20])
        prompt = f"What action is being performed in this video? Choose from: {action_list}. Respond with just the action name."
        try:
            with torch.no_grad():
                response = model.chat(
                    tokenizer=tokenizer,
                    pixel_values=pil_frames,
                    question=prompt,
                    generation_config={"max_new_tokens": 50, "do_sample": False},
                )
            response_lower = response.lower()
            best_match = "unknown action"
            best_score = 0.0
            # First candidate found verbatim in the reply wins.
            for candidate in candidates:
                if candidate.lower() in response_lower:
                    best_match = candidate
                    best_score = 0.9  # fixed score: chat models emit no calibrated probability
                    break
            is_foul = any(kw in best_match.lower() for kw in FOUL_KEYWORDS)
            return {
                "action": best_match,
                "confidence": best_score,
                "is_foul_related": is_foul,
                "top_5_predictions": [{"action": best_match, "confidence": best_score}],
                "model": MODEL_NAME,
                "raw_response": response,
            }
        except Exception as e:
            # Inference failure on the primary model: retry with VideoMAE.
            print(f"InternVideo2.5 inference failed: {e}")
            return _classify_fallback_gpu(frames)
    finally:
        # Guard: `model` may still be None if every load attempt failed;
        # calling .cpu() on None would mask the original exception.
        if model is not None:
            model.cpu()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if ZEROGPU_AVAILABLE:
    # ZeroGPU only grants CUDA inside functions wrapped by spaces.GPU;
    # without this, model.to("cuda") fails on a ZeroGPU Space.
    _classify_gpu = spaces.GPU(duration=120)(_classify_gpu)
def _classify_fallback_gpu(frames: np.ndarray) -> dict[str, Any]:
    """Fallback classification using VideoMAE (called while already on GPU).

    Args:
        frames: stacked RGB frames; uniformly resampled to the 16 frames
            VideoMAE expects when the count differs.

    Returns:
        Dict with action, confidence, class_id, is_foul_related,
        top_5_predictions, and the fallback model id.
    """
    # VideoMAE requires exactly 16 frames.
    if len(frames) != 16:
        sample_idx = np.linspace(0, len(frames) - 1, 16, dtype=int)
        frames = frames[sample_idx]
    frame_seq = [frames[i] for i in range(16)]

    batch = tokenizer(frame_seq, return_tensors="pt")
    batch = {name: tensor.to("cuda") for name, tensor in batch.items()}

    with torch.no_grad():
        logits = model(**batch).logits

    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    top_prob, top_idx = torch.max(probs, dim=0)
    predicted = model.config.id2label[int(top_idx)]

    top5_probs, top5_indices = torch.topk(probs, k=min(5, len(probs)))
    top5 = [
        {"action": model.config.id2label[int(idx)], "confidence": float(prob)}
        for prob, idx in zip(top5_probs, top5_indices)
    ]

    # Kinetics-400 class names considered foul-related for this fallback.
    foul_classes = {"tackling", "wrestling", "headbutting", "punching person (boxing)"}
    return {
        "action": predicted,
        "confidence": float(top_prob),
        "class_id": int(top_idx),
        "is_foul_related": predicted in foul_classes,
        "top_5_predictions": top5,
        "model": "MCG-NJU/videomae-base-finetuned-kinetics",
    }
def demo_classify_video(video_file, custom_actions: str) -> str:
    """Demo function for video classification (UI).

    Args:
        video_file: gr.File value — a filepath string or an object exposing
            a `.name` attribute.
        custom_actions: optional comma-separated action labels (may be None
            or empty from the textbox).

    Returns:
        Markdown-formatted result string, or a human-readable error message.
    """
    try:
        if video_file is None:
            return "Please upload a video file."
        if isinstance(video_file, str):
            video_path = video_file
        elif hasattr(video_file, "name"):
            video_path = video_file.name
        else:
            return f"Error: Unexpected file type: {type(video_file)}"

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Failed to open video file."
        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total < 4:
                return f"Video too short: {total} frames"
            # Sample up to 16 frames uniformly across the whole clip.
            indices = np.linspace(0, total - 1, min(16, total), dtype=int)
            frames = []
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    frames.append(cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (224, 224)))
        finally:
            # Release the capture even if extraction raises (fixes handle leak).
            cap.release()

        if len(frames) < 4:
            return f"Could not extract enough frames (got {len(frames)})"

        # Parse custom labels: tolerate None, drop empty entries (an empty
        # candidate would substring-match any model response downstream).
        custom = None
        if custom_actions and custom_actions.strip():
            custom = [a.strip() for a in custom_actions.split(",") if a.strip()] or None

        result = _classify_gpu(np.stack(frames), custom)

        output = f"**Predicted Action:** {result['action']}\n"
        output += f"**Confidence:** {result['confidence']:.2%}\n"
        output += f"**Foul-Related:** {'Yes' if result['is_foul_related'] else 'No'}\n\n"
        output += "**Top 5 Predictions:**\n"
        for pred in result["top_5_predictions"]:
            output += f"- {pred['action']}: {pred['confidence']:.2%}\n"
        output += f"\n**Model:** {result['model']}"
        return output
    except Exception as e:
        return f"Error: {str(e)}\n{traceback.format_exc()}"
| def api_classify_action( | |
| frames_base64: str | None = None, | |
| timestamp_s: float = 0.0, | |
| action_candidates: str | None = None, | |
| ) -> dict[str, Any]: | |
| """API endpoint for action classification from EagleEye.""" | |
| try: | |
| frames_list = json.loads(frames_base64) if frames_base64 and frames_base64.startswith("[") else [] | |
| if not frames_list: | |
| return {"success": False, "error": "No frames provided"} | |
| frames = np.stack([decode_base64_image(b64) for b64 in frames_list]) | |
| candidates = None | |
| if action_candidates and action_candidates.strip(): | |
| try: | |
| if action_candidates.startswith("["): | |
| candidates = json.loads(action_candidates) | |
| else: | |
| candidates = [c.strip() for c in action_candidates.split(",")] | |
| except Exception: | |
| candidates = None | |
| result = _classify_gpu(frames, candidates) | |
| return { | |
| "success": True, | |
| "action": result["action"], | |
| "confidence": result["confidence"], | |
| "is_foul_related": result["is_foul_related"], | |
| "top_5_predictions": result["top_5_predictions"], | |
| "timestamp_s": timestamp_s, | |
| "model": result["model"], | |
| } | |
| except Exception as e: | |
| return {"success": False, "error": str(e), "traceback": traceback.format_exc()} | |
# --- Gradio UI --------------------------------------------------------------
# One Blocks app: a hidden row exposes api_classify_action to gradio_client,
# a Demo tab offers manual upload, and an API tab documents client usage.
with gr.Blocks(title="InternVideo2.5 for Cadayn") as demo:
    # Landing copy shown at the top of the Space.
    gr.Markdown("""
# InternVideo2.5 - Action Recognition
Powered by [InternVideo2.5-8B](https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B) on ZeroGPU.
**SOTA Performance:**
- 92.1% accuracy on Kinetics-400 (+11.2% over VideoMAE)
- Open-vocabulary action detection
- Custom sports-specific actions
**API Endpoints for EagleEye:**
- `POST /call/api_classify_action` - Action classification
""")

    # Hidden components (visible=False): never shown in the UI, they exist
    # only so gradio_client can push values in and read the JSON result.
    with gr.Row(visible=False):
        api_frames_input = gr.Textbox()
        api_timestamp_input = gr.Number()
        api_candidates_input = gr.Textbox()
        api_result_output = gr.JSON()

    # Registering with api_name makes this callable as /api_classify_action;
    # the .change event fires when a client writes to api_frames_input.
    api_frames_input.change(
        fn=api_classify_action,
        inputs=[api_frames_input, api_timestamp_input, api_candidates_input],
        outputs=api_result_output,
        api_name="api_classify_action",
    )

    # Interactive demo: upload a clip, optionally supply custom labels.
    with gr.Tab("Demo"):
        with gr.Row():
            with gr.Column():
                video_input = gr.File(
                    label="Upload Video",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                )
                custom_actions_input = gr.Textbox(
                    label="Custom Actions (optional)",
                    placeholder="running, jumping, dancing, ...",
                    lines=2,
                )
                classify_btn = gr.Button("Classify Action", variant="primary")
            with gr.Column():
                result_output = gr.Markdown(label="Result")
        classify_btn.click(
            fn=demo_classify_video,
            inputs=[video_input, custom_actions_input],
            outputs=result_output,
        )

    # Static documentation tab with a copy-pasteable gradio_client snippet.
    with gr.Tab("API"):
        gr.Markdown("""
## API Usage for EagleEye Integration
### Action Classification
```python
from gradio_client import Client
import json
import base64
client = Client("magboola/internvideo2-zerogpu")
frames_b64 = [base64.b64encode(frame_bytes).decode() for frame_bytes in frames]
custom_actions = ["scoring a goal", "making a tackle", "celebrating"]
result = client.predict(
    frames_base64=json.dumps(frames_b64),
    timestamp_s=5.0,
    action_candidates=json.dumps(custom_actions),
    api_name="/api_classify_action"
)
print(result)
# {"success": True, "action": "scoring a goal", "confidence": 0.9, ...}
```
""")

# Launch only when run as a script (HF Spaces also executes this path).
if __name__ == "__main__":
    demo.launch()