RaghavaMukkamala commited on
Commit
b9d859e
·
verified ·
1 Parent(s): 11b4d43

final updates

Browse files
Files changed (1) hide show
  1. app.py +114 -22
app.py CHANGED
@@ -1,49 +1,141 @@
1
- import gradio as gr
 
 
2
  import torch
3
- from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 
4
 
5
- from videollama3.conversation import conv_templates
6
- from videollama3.inference import chat
 
 
 
7
 
 
 
 
8
  MODEL_ID = "DAMO-NLP-SG/VideoLLaMA3-2B"
 
 
9
 
 
 
 
 
 
 
 
 
10
  tokenizer = AutoTokenizer.from_pretrained(
11
  MODEL_ID,
12
  trust_remote_code=True
13
  )
14
 
 
15
  model = AutoModelForCausalLM.from_pretrained(
16
  MODEL_ID,
17
  trust_remote_code=True,
18
- dtype=torch.float16,
19
  device_map="auto"
20
  )
21
-
22
  model.eval()
23
 
24
- def infer(video, prompt):
25
- conv = conv_templates["videollama3"].copy()
26
- conv.append_message(conv.roles[0], prompt)
27
- conv.append_message(conv.roles[1], None)
28
-
29
- output = chat(
30
- model=model,
31
- tokenizer=tokenizer,
32
- video_path=video,
33
- conversation=conv,
34
- max_new_tokens=512,
35
- temperature=0.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
37
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  demo = gr.Interface(
40
  fn=infer,
41
  inputs=[
42
  gr.Video(label="Upload video"),
43
- gr.Textbox(label="Prompt")
 
 
 
44
  ],
45
- outputs="text",
46
- title="🎥 Video-LLaMA-3 Demo"
 
47
  )
48
 
49
  demo.launch()
 
1
+ import os
2
+ os.environ["OMP_NUM_THREADS"] = "1"
3
+
4
  import torch
5
+ import gradio as gr
6
+ import cv2
7
+ import decord
8
+ import numpy as np
9
 
10
+ from transformers import (
11
+ AutoTokenizer,
12
+ AutoModelForCausalLM,
13
+ GenerationConfig,
14
+ )
15
 
# ------------------------
# Configuration
# ------------------------
MODEL_ID = "DAMO-NLP-SG/VideoLLaMA3-2B"

# Prefer GPU when one is visible; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16

MAX_FRAMES = 32       # uniformly sampled frames per video; reduce if you hit OOM
MAX_NEW_TOKENS = 512  # generation budget per answer
TEMPERATURE = 0.2     # low temperature -> near-deterministic answers
# ------------------------
# Load model & tokenizer
# ------------------------
# Loading happens once at import time so every Gradio request reuses the
# same weights.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    device_map="auto",
)
model.eval()  # inference only — disables dropout and the like

# Decoding settings shared by every request.
generation_config = GenerationConfig(
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
    do_sample=True,
)
# ------------------------
# Video utilities (from demo_video_llama3.py)
# ------------------------
def load_video(video_path, max_frames=32):
    """Load a video and sample at most ``max_frames`` frames uniformly.

    Args:
        video_path: Path to a video file readable by decord.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        numpy array of shape (T, H, W, C) with T <= max_frames.

    Raises:
        ValueError: If the video contains no decodable frames (would
            otherwise fail deep inside decord with an opaque error).
    """
    vr = decord.VideoReader(video_path, ctx=decord.cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        raise ValueError(f"No decodable frames in video: {video_path!r}")

    if total_frames <= max_frames:
        # Short clip: keep every frame.
        indices = list(range(total_frames))
    else:
        # Long clip: evenly spaced sample across the whole duration.
        indices = np.linspace(0, total_frames - 1, max_frames, dtype=int).tolist()

    return vr.get_batch(indices).asnumpy()
# ------------------------
# Inference
# ------------------------
def videollama3_infer(video_path, prompt):
    """Answer ``prompt`` about the video at ``video_path``.

    Args:
        video_path: Filesystem path to the uploaded video, or None.
        prompt: User question about the video.

    Returns:
        The model's text response, or a user-facing message when no
        video was supplied.
    """
    if video_path is None:
        return "Please upload a video."

    # Load & sample video to a (T, H, W, C) uint8 frame array.
    frames = load_video(video_path, MAX_FRAMES)

    # Build multimodal prompt (as in official demo)
    system_prompt = (
        "You are VideoLLaMA, a helpful assistant that understands videos."
    )

    full_prompt = (
        f"<|system|>\n{system_prompt}\n"
        f"<|user|>\n{prompt}\n"
        f"<|assistant|>\n"
    )

    inputs = tokenizer(
        full_prompt,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            generation_config=generation_config,
            # NOTE(review): frames are raw uint8 pixels; confirm the model's
            # remote code normalizes/casts them — otherwise they may need the
            # model's image processor first.
            videos=torch.tensor(frames).to(model.device)
        )

    # ``generate`` returns prompt tokens + completion tokens.  Decode only
    # the newly generated tokens: splitting the decoded text on
    # "<|assistant|>" is fragile because skip_special_tokens=True strips
    # that marker whenever the tokenizer treats it as special, in which
    # case the split returned the whole output (prompt echo included).
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(
        outputs[0][prompt_len:],
        skip_special_tokens=True
    )
    return response.strip()
# ------------------------
# Gradio UI
# ------------------------
def infer(video, prompt):
    """Gradio entry point: run inference, turning CUDA OOM into a friendly message.

    Args:
        video: Path of the uploaded video file (or None).
        prompt: User's question about the video.

    Returns:
        The model response, or a warning string on CUDA out-of-memory.
    """
    try:
        return videollama3_infer(video, prompt)
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            # Release cached allocator blocks so the *next* request has a
            # chance to fit; without this every later request OOMs too.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return "⚠️ CUDA out of memory. Try a shorter video."
        raise  # anything else is a real bug — keep the original traceback
# Build the web UI: one video upload plus a free-text prompt in,
# a single text box out.
video_input = gr.Video(label="Upload video")
prompt_input = gr.Textbox(
    label="Prompt",
    placeholder="Describe what happens in the video",
)

demo = gr.Interface(
    fn=infer,
    inputs=[video_input, prompt_input],
    outputs=gr.Textbox(label="Model output"),
    title="🎥 VideoLLaMA-3 Demo",
    description="Ask questions about short videos using VideoLLaMA-3",
)

demo.launch()