# NOTE(review): the three lines below were scraped web-page residue
# ("magboola's picture" / commit banner) that would be a syntax error.
# Preserved as a comment:
# commit 69554dd (verified) — fix: hardcode cuda in GPU functions for ZeroGPU compatibility
"""
InternVideo2.5 ZeroGPU HuggingFace Space
Provides action recognition API for EagleEye video analysis.
Uses InternVideo2.5-8B for SOTA open-vocabulary action detection (92.1% on Kinetics-400).
"""
from __future__ import annotations
import base64
import gc
import io
import json
import os
import traceback
from typing import Any
import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
# Prefer the real `spaces` package (present on HuggingFace ZeroGPU hardware).
# Anywhere else, install a no-op stand-in so the `@spaces.GPU(...)` decorators
# used below keep working unchanged.
try:
    import spaces
    ZEROGPU_AVAILABLE = True
except ImportError:
    ZEROGPU_AVAILABLE = False

    class _spaces_stub:
        """Minimal stand-in mirroring the `spaces.GPU` decorator factory."""

        @staticmethod
        def GPU(duration=60):
            # Pass-through decorator: hands the wrapped function back unchanged.
            return lambda func: func

    spaces = _spaces_stub()
# HuggingFace Hub id of the primary action-recognition model.
MODEL_NAME = "OpenGVLab/InternVideo2_5_Chat_8B"

# Lazily-initialized globals, populated by load_model() on first use.
# `tokenizer` holds an AutoTokenizer for InternVideo, or a
# VideoMAEImageProcessor once the fallback has been loaded.
model = None
tokenizer = None
# Flipped to True by _load_fallback() when VideoMAE replaces InternVideo.
_is_fallback = False

# Default open-vocabulary action labels offered to the model when the caller
# supplies no candidates. Contact/foul-related actions are listed first;
# note only the first 20 entries are placed into the prompt (see _classify_gpu).
DEFAULT_ACTION_CANDIDATES = [
    "tackling", "wrestling", "headbutting", "punching", "kicking",
    "pushing", "slapping", "sword fighting", "shooting goal (soccer)",
    "dribbling basketball", "passing soccer ball", "catching or throwing baseball",
    "shooting basketball", "dunking basketball", "heading ball",
    "volleyball spiking", "playing tennis", "playing badminton",
    "running", "jogging", "walking", "jumping", "diving", "swimming",
    "climbing", "dancing", "clapping", "waving hand", "shaking hands",
    "hugging", "kissing", "laughing", "crying", "talking", "singing",
    "playing musical instrument", "cooking", "eating", "drinking",
    "writing", "typing", "using phone", "using computer",
    "driving car", "riding bicycle", "riding motorcycle", "riding horse",
    "standing still", "sitting", "celebrating", "arguing", "fighting",
    "falling down", "getting up", "stretching", "exercising", "unknown action",
]

# Substrings that mark a predicted action as potentially foul-related.
FOUL_KEYWORDS = {"tackling", "wrestling", "headbutting", "punching", "kicking", "pushing", "slapping", "fighting"}
def load_model():
    """Lazily load InternVideo2.5 exactly once; fall back to VideoMAE on failure.

    Returns:
        (model, tokenizer) module-level globals after loading.
    """
    global model, tokenizer, _is_fallback
    # Already loaded (either primary or fallback): nothing to do.
    if model is not None:
        return model, tokenizer
    try:
        from transformers import AutoModel, AutoTokenizer

        print(f"Loading {MODEL_NAME}...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        model.eval()
        print("InternVideo2.5 loaded")
    except Exception as e:
        # Any load failure (download, remote code, OOM) downgrades to VideoMAE.
        print(f"Failed to load InternVideo2.5: {e}. Falling back to VideoMAE.")
        _load_fallback()
    return model, tokenizer
def _load_fallback():
    """Swap in the VideoMAE Kinetics-400 classifier when InternVideo2.5 is unavailable."""
    global model, tokenizer, _is_fallback
    from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

    fallback = "MCG-NJU/videomae-base-finetuned-kinetics"
    print(f"Loading fallback: {fallback}")
    # In fallback mode the `tokenizer` global holds the image processor.
    tokenizer, model = (
        VideoMAEImageProcessor.from_pretrained(fallback),
        VideoMAEForVideoClassification.from_pretrained(fallback),
    )
    model.eval()
    _is_fallback = True
def decode_base64_image(b64_str: str) -> np.ndarray:
    """Decode a base64-encoded image into an RGB numpy array of shape (H, W, 3)."""
    raw = base64.b64decode(b64_str)
    with Image.open(io.BytesIO(raw)) as img:
        return np.array(img.convert("RGB"))
@spaces.GPU(duration=180)
def _classify_gpu(frames: np.ndarray, action_candidates: list[str] | None = None) -> dict[str, Any]:
    """GPU-only: run action classification on frames.

    Args:
        frames: stacked RGB frames — assumed shape (T, H, W, 3); TODO confirm at callers.
        action_candidates: optional open-vocabulary labels; defaults to
            DEFAULT_ACTION_CANDIDATES (only the first 20 enter the prompt).

    Returns:
        dict with keys: action, confidence, is_foul_related,
        top_5_predictions, model (plus raw_response on the InternVideo path).
    """
    try:
        load_model()
        # CUDA is hardcoded: inside a @spaces.GPU call ZeroGPU guarantees a GPU.
        model.to("cuda")
        if _is_fallback:
            return _classify_fallback_gpu(frames)
        candidates = action_candidates or DEFAULT_ACTION_CANDIDATES
        # Subsample to at most 8 evenly spaced frames for the chat model.
        if len(frames) > 8:
            indices = np.linspace(0, len(frames) - 1, 8, dtype=int)
            frames = frames[indices]
        pil_frames = [Image.fromarray(f) for f in frames]
        action_list = ", ".join(candidates[:20])
        prompt = f"What action is being performed in this video? Choose from: {action_list}. Respond with just the action name."
        try:
            with torch.no_grad():
                response = model.chat(
                    tokenizer=tokenizer,
                    pixel_values=pil_frames,
                    question=prompt,
                    generation_config={"max_new_tokens": 50, "do_sample": False},
                )
            # Map the model's free-text reply back onto the first matching candidate.
            response_lower = response.lower()
            best_match = "unknown action"
            best_score = 0.0
            for candidate in candidates:
                if candidate.lower() in response_lower:
                    best_match = candidate
                    best_score = 0.9  # heuristic confidence; chat model returns no scores
                    break
            is_foul = any(kw in best_match.lower() for kw in FOUL_KEYWORDS)
            return {
                "action": best_match,
                "confidence": best_score,
                "is_foul_related": is_foul,
                "top_5_predictions": [{"action": best_match, "confidence": best_score}],
                "model": MODEL_NAME,
                "raw_response": response,
            }
        except Exception as e:
            # Inference failure on the primary model: retry with VideoMAE.
            print(f"InternVideo2.5 inference failed: {e}")
            return _classify_fallback_gpu(frames)
    finally:
        # Release GPU memory between ZeroGPU invocations.
        # Fix: guard against `model is None` (load_model itself failed) —
        # previously an unconditional model.cpu() raised AttributeError here
        # and masked the original exception.
        if model is not None:
            model.cpu()
        gc.collect()
        torch.cuda.empty_cache()
def _classify_fallback_gpu(frames: np.ndarray) -> dict[str, Any]:
    """Fallback classification using VideoMAE (called while already on GPU)."""
    # VideoMAE expects exactly 16 frames; resample evenly when needed.
    if len(frames) != 16:
        picks = np.linspace(0, len(frames) - 1, 16, dtype=int)
        frames = frames[picks]
    batch = tokenizer([frames[i] for i in range(16)], return_tensors="pt")
    batch = {name: tensor.to("cuda") for name, tensor in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    top_prob, top_idx = torch.max(probs, dim=0)
    action = model.config.id2label[int(top_idx)]
    # Collect the top-k (k <= 5) labels with their probabilities.
    top5_probs, top5_ids = torch.topk(probs, k=min(5, len(probs)))
    top5 = []
    for p, i in zip(top5_probs, top5_ids):
        top5.append({"action": model.config.id2label[int(i)], "confidence": float(p)})
    foul_classes = {"tackling", "wrestling", "headbutting", "punching person (boxing)"}
    return {
        "action": action,
        "confidence": float(top_prob),
        "class_id": int(top_idx),
        "is_foul_related": action in foul_classes,
        "top_5_predictions": top5,
        "model": "MCG-NJU/videomae-base-finetuned-kinetics",
    }
def demo_classify_video(video_file, custom_actions: str) -> str:
    """Demo function for video classification (UI).

    Args:
        video_file: a path string or an uploaded-file object (gr.File) with a
            `.name` attribute; None when nothing was uploaded.
        custom_actions: optional comma-separated action labels; may be None
            or empty when the textbox is untouched.

    Returns:
        Markdown-formatted result string, or a human-readable error message.
    """
    try:
        if video_file is None:
            return "Please upload a video file."
        if isinstance(video_file, str):
            video_path = video_file
        elif hasattr(video_file, "name"):
            video_path = video_file.name
        else:
            return f"Error: Unexpected file type: {type(video_file)}"
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return "Failed to open video file."
        try:
            total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total < 4:
                return f"Video too short: {total} frames"
            # Sample up to 16 evenly spaced frames, converted to 224x224 RGB.
            indices = np.linspace(0, total - 1, min(16, total), dtype=int)
            frames = []
            for idx in indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if ret:
                    frames.append(cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (224, 224)))
        finally:
            # Fix: release the capture even if extraction raises (was leaked
            # on the exception path before).
            cap.release()
        if len(frames) < 4:
            return f"Could not extract enough frames (got {len(frames)})"
        # Fix: custom_actions may be None (cleared Gradio textbox) — the old
        # unconditional .strip() raised AttributeError. Also drop empty
        # entries produced by stray commas.
        custom = None
        if custom_actions and custom_actions.strip():
            custom = [a.strip() for a in custom_actions.split(",") if a.strip()] or None
        result = _classify_gpu(np.stack(frames), custom)
        output = f"**Predicted Action:** {result['action']}\n"
        output += f"**Confidence:** {result['confidence']:.2%}\n"
        output += f"**Foul-Related:** {'Yes' if result['is_foul_related'] else 'No'}\n\n"
        output += "**Top 5 Predictions:**\n"
        for pred in result["top_5_predictions"]:
            output += f"- {pred['action']}: {pred['confidence']:.2%}\n"
        output += f"\n**Model:** {result['model']}"
        return output
    except Exception as e:
        return f"Error: {str(e)}\n{traceback.format_exc()}"
def api_classify_action(
    frames_base64: str | None = None,
    timestamp_s: float = 0.0,
    action_candidates: str | None = None,
) -> dict[str, Any]:
    """API endpoint for action classification from EagleEye.

    Args:
        frames_base64: JSON array (as a string) of base64-encoded frame images.
        timestamp_s: caller-supplied timestamp, echoed back in the response.
        action_candidates: optional JSON array or comma-separated labels.

    Returns:
        On success: {"success": True, action, confidence, is_foul_related,
        top_5_predictions, timestamp_s, model}. On failure: {"success": False,
        "error": ..., [traceback]}.
    """
    try:
        # Fix: tolerate leading whitespace before the JSON array — previously
        # " [..]" payloads were silently treated as "no frames".
        frames_list = []
        if frames_base64 and frames_base64.lstrip().startswith("["):
            frames_list = json.loads(frames_base64)
        if not frames_list:
            return {"success": False, "error": "No frames provided"}
        frames = np.stack([decode_base64_image(b64) for b64 in frames_list])
        candidates = None
        if action_candidates and action_candidates.strip():
            try:
                raw = action_candidates.strip()
                if raw.startswith("["):
                    candidates = json.loads(raw)
                else:
                    # Fix: drop empty entries from stray commas ("a,,b").
                    candidates = [c.strip() for c in raw.split(",") if c.strip()]
                candidates = candidates or None  # empty list -> use defaults
            except Exception:
                candidates = None  # best-effort: bad candidate spec falls back to defaults
        result = _classify_gpu(frames, candidates)
        return {
            "success": True,
            "action": result["action"],
            "confidence": result["confidence"],
            "is_foul_related": result["is_foul_related"],
            "top_5_predictions": result["top_5_predictions"],
            "timestamp_s": timestamp_s,
            "model": result["model"],
        }
    except Exception as e:
        return {"success": False, "error": str(e), "traceback": traceback.format_exc()}
# --- Gradio app definition -------------------------------------------------
with gr.Blocks(title="InternVideo2.5 for Cadayn") as demo:
    # Landing description shown at the top of the Space.
    gr.Markdown("""
# InternVideo2.5 - Action Recognition
Powered by [InternVideo2.5-8B](https://huggingface.co/OpenGVLab/InternVideo2_5_Chat_8B) on ZeroGPU.
**SOTA Performance:**
- 92.1% accuracy on Kinetics-400 (+11.2% over VideoMAE)
- Open-vocabulary action detection
- Custom sports-specific actions
**API Endpoints for EagleEye:**
- `POST /call/api_classify_action` - Action classification
""")
    # Hidden components whose sole purpose is to expose api_classify_action
    # as a named gradio_client endpoint ("/api_classify_action").
    with gr.Row(visible=False):
        api_frames_input = gr.Textbox()
        api_timestamp_input = gr.Number()
        api_candidates_input = gr.Textbox()
        api_result_output = gr.JSON()
    api_frames_input.change(
        fn=api_classify_action,
        inputs=[api_frames_input, api_timestamp_input, api_candidates_input],
        outputs=api_result_output,
        api_name="api_classify_action",
    )
    # Interactive demo tab: upload a video, optionally supply custom labels.
    with gr.Tab("Demo"):
        with gr.Row():
            with gr.Column():
                video_input = gr.File(
                    label="Upload Video",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                )
                custom_actions_input = gr.Textbox(
                    label="Custom Actions (optional)",
                    placeholder="running, jumping, dancing, ...",
                    lines=2,
                )
                classify_btn = gr.Button("Classify Action", variant="primary")
            with gr.Column():
                result_output = gr.Markdown(label="Result")
        classify_btn.click(
            fn=demo_classify_video,
            inputs=[video_input, custom_actions_input],
            outputs=result_output,
        )
    # Static API-usage documentation tab.
    with gr.Tab("API"):
        gr.Markdown("""
## API Usage for EagleEye Integration
### Action Classification
```python
from gradio_client import Client
import json
import base64
client = Client("magboola/internvideo2-zerogpu")
frames_b64 = [base64.b64encode(frame_bytes).decode() for frame_bytes in frames]
custom_actions = ["scoring a goal", "making a tackle", "celebrating"]
result = client.predict(
    frames_base64=json.dumps(frames_b64),
    timestamp_s=5.0,
    action_candidates=json.dumps(custom_actions),
    api_name="/api_classify_action"
)
print(result)
# {"success": True, "action": "scoring a goal", "confidence": 0.9, ...}
```
""")

if __name__ == "__main__":
    # HF Spaces entry point: start the Gradio server.
    demo.launch()