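"""Gradio demo: run two video models on an uploaded clip and return JSON results.

Model 1 (VideoMAE) classifies the action in the clip; model 2
(LLaVA-Video-Llama-3.1-8B) answers a free-form text prompt.
"""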
import gradio as gr
import json
import torch
import decord
from transformers import (
    AutoProcessor,
    AutoModelForVideoClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
)
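
# Dependencies: pip install gradio torch decord transformers
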
# ======================
# 1. VideoMAE
# ======================
vm_model_id = "MCG-NJU/videomae-base-finetuned-kinetics"
vm_processor = AutoProcessor.from_pretrained(vm_model_id)
vm_model = AutoModelForVideoClassification.from_pretrained(vm_model_id)
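# The model stays on CPU here; to use a GPU, move both vm_model and the
# processed inputs to the same device (e.g. with .to("cuda")).
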
def run_videomae(video):
    """Classify the action in the clip with VideoMAE (Kinetics-400 labels)."""
    try:
        vr = decord.VideoReader(video, num_threads=1)
        # Sample 16 evenly spaced frames (the clip length VideoMAE expects),
        # clamping the list so uneven strides never yield a 17th frame.
        step = max(1, len(vr) // 16)
        frames = [vr[i].asnumpy() for i in range(0, len(vr), step)][:16]
        inputs = vm_processor(images=frames, return_tensors="pt")
        with torch.no_grad():
            outputs = vm_model(**inputs)
        pred_id = outputs.logits.argmax(-1).item()
        return {
            "model": "VideoMAE",
            "status": "ok",
            "class": vm_model.config.id2label[pred_id],
            "confidence": torch.softmax(outputs.logits, -1)[0, pred_id].item(),
        }
    except Exception as e:
        return {"model": "VideoMAE", "status": "failed", "error": str(e)}

# ======================
# 2. LLaVA-Video-Llama-3.1-8B
# ======================
try:
    llava_model_id = "weizhiwang/LLaVA-Video-Llama-3.1-8B"
    llava_tokenizer = AutoTokenizer.from_pretrained(llava_model_id, trust_remote_code=True)
    llava_model = (
        AutoModelForCausalLM.from_pretrained(llava_model_id, trust_remote_code=True)
        .half()
        .cuda()
        .eval()
    )

    def run_llava(video, prompt):
        try:
            # NOTE: only the text prompt is passed to the model here; feeding in
            # the video frames requires the checkpoint's own multimodal
            # preprocessing, which is not wired up in this demo.
            inputs = llava_tokenizer(prompt, return_tensors="pt").to("cuda")
            output = llava_model.generate(**inputs, max_new_tokens=256)
            return {
                "model": "LLaVA-Video-Llama-3.1-8B",
                "status": "ok",
                "output": llava_tokenizer.decode(output[0], skip_special_tokens=True),
            }
        except Exception as e:
            return {"model": "LLaVA-Video-Llama-3.1-8B", "status": "failed", "error": str(e)}

except Exception as outer_error:
    # Fall back to a stub when the checkpoint (or a GPU) is unavailable, so the
    # rest of the app keeps working.
    llava_load_error = str(outer_error)

    def run_llava(video, prompt):
        return {
            "model": "LLaVA-Video-Llama-3.1-8B",
            "status": "failed",
            "error": f"LLaVA not available (requires bleeding-edge Transformers). Details: {llava_load_error}",
        }
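
# Loading this checkpoint may require installing transformers from source:
#   pip install git+https://github.com/huggingface/transformers
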
# ======================
# Unified App
# ======================
def analyze_all(video, prompt):
    results = [run_videomae(video), run_llava(video, prompt)]
    return json.dumps(results, indent=2)
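
# Shape of the combined output (actual values depend on the inputs):
# [
#   {"model": "VideoMAE", "status": "ok", "class": "...", "confidence": ...},
#   {"model": "LLaVA-Video-Llama-3.1-8B", "status": "ok", "output": "..."}
# ]
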
demo = gr.Interface(
    fn=analyze_all,
    inputs=[gr.Video(label="Upload Video"), gr.Textbox(label="Prompt")],
    outputs="json",
)
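
# demo.launch(share=True) would additionally create a temporary public Gradio link.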
if __name__ == "__main__":
    demo.launch()