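"""Gradio demo that runs two video models on one uploaded clip:

1. VideoMAE (MCG-NJU/videomae-base-finetuned-kinetics): Kinetics-400
   action classification.
2. LLaVA-Video-Llama-3.1-8B (weizhiwang/LLaVA-Video-Llama-3.1-8B):
   prompted video description, when the model can be loaded.

Both results are returned together as a single JSON list.
"""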
import gradio as gr
import json
import numpy as np
import torch
import decord
from transformers import (
    AutoProcessor,
    AutoModelForVideoClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
)

# ======================
# 1. VideoMAE
# ======================
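# Kinetics-400 action classifier. AutoProcessor resolves to the VideoMAE
# image processor for this checkpoint; the model runs on CPU by default.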
vm_model_id = "MCG-NJU/videomae-base-finetuned-kinetics"
vm_processor = AutoProcessor.from_pretrained(vm_model_id)
vm_model = AutoModelForVideoClassification.from_pretrained(vm_model_id)

def run_videomae(video):
    try:
        vr = decord.VideoReader(video, num_threads=1)
        # VideoMAE expects exactly 16 input frames; sample them evenly
        # across the whole clip (repeats are fine for very short videos).
        indices = np.linspace(0, len(vr) - 1, num=16).astype(int)
        frames = [vr[int(i)].asnumpy() for i in indices]
        inputs = vm_processor(images=frames, return_tensors="pt")
        with torch.no_grad():
            outputs = vm_model(**inputs)
        pred_id = outputs.logits.argmax(-1).item()
        return {
            "model": "VideoMAE",
            "status": "ok",
            "class": vm_model.config.id2label[pred_id],
            "confidence": float(torch.softmax(outputs.logits, -1)[0, pred_id].item()),
        }
    except Exception as e:
        return {"model": "VideoMAE", "status": "failed", "error": str(e)}

# ======================
# 2. LLaVA-Video-Llama-3.1-8B
# ======================
try:
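    # The custom LLaVA repo may require a very recent Transformers release
    # and a CUDA GPU; guard the load so a failure degrades to the stub
    # defined in the except branch instead of crashing the whole app.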
    llava_model_id = "weizhiwang/LLaVA-Video-Llama-3.1-8B"
    llava_tokenizer = AutoTokenizer.from_pretrained(llava_model_id, trust_remote_code=True)
    llava_model = AutoModelForCausalLM.from_pretrained(
        llava_model_id, torch_dtype=torch.float16, trust_remote_code=True
    ).cuda().eval()

    def run_llava(video, prompt):
        try:
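            # Caveat: this sketch only feeds the text prompt to the model;
            # the uploaded video is not passed in. Wiring frames through
            # depends on the repo's custom (trust_remote_code) pipeline.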
            inputs = llava_tokenizer(prompt, return_tensors="pt").to("cuda")
            output = llava_model.generate(**inputs, max_new_tokens=256)
            return {
                "model": "LLaVA-Video-Llama-3.1-8B",
                "status": "ok",
                "output": llava_tokenizer.decode(output[0], skip_special_tokens=True),
            }
        except Exception as e:
            return {"model": "LLaVA-Video-Llama-3.1-8B", "status": "failed", "error": str(e)}

except Exception as outer_error:
    llava_load_error = str(outer_error)

    def run_llava(video, prompt):
        return {
            "model": "LLaVA-Video-Llama-3.1-8B",
            "status": "failed",
            "error": f"LLaVA not available (requires bleeding-edge Transformers). Details: {llava_load_error}",
        }

# ======================
# Unified App
# ======================
def analyze_all(video, prompt):
    results = []
    results.append(run_videomae(video))
    results.append(run_llava(video, prompt))
    return json.dumps(results, indent=2)
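
# Example output shape (values depend on the clip and prompt):
# [
#   {"model": "VideoMAE", "status": "ok", "class": "...", "confidence": ...},
#   {"model": "LLaVA-Video-Llama-3.1-8B", "status": "ok", "output": "..."}
# ]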

demo = gr.Interface(
    fn=analyze_all,
    inputs=[gr.Video(label="Upload Video"), gr.Textbox(label="Prompt")],
    outputs="json",
)

if __name__ == "__main__":
    demo.launch()