akhaliq HF Staff committed
Commit 77c5fd0 · verified · 1 Parent(s): 62f3901

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +270 -0
app.py ADDED
@@ -0,0 +1,270 @@
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+ from decord import VideoReader, cpu
+ from scipy.spatial import cKDTree
+ import numpy as np
+ import math
+
+ # Model and tokenizer are loaded lazily on first request (see load_model)
+ model = None
+ tokenizer = None
+
+ MAX_NUM_FRAMES = 180   # max sampled timestamps per packing group
+ MAX_NUM_PACKING = 3    # max frames packed into one temporal group
+ TIME_SCALE = 0.1       # granularity (seconds) of the temporal-id grid
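+ # Together these cap the input at MAX_NUM_FRAMES * MAX_NUM_PACKING = 540
+ # sampled frames per video (matching the "Video Info" note in the UI below).
+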
+ def load_model():
+     """Load MiniCPM-V-4.5 and its tokenizer once, caching them in globals."""
+     global model, tokenizer
+     if model is None:
+         gr.Info("Loading model... This may take a moment.")
+         model = AutoModel.from_pretrained(
+             'openbmb/MiniCPM-V-4_5',
+             trust_remote_code=True,
+             attn_implementation='sdpa',
+             torch_dtype=torch.bfloat16
+         )
+         model = model.eval().cuda()
+         tokenizer = AutoTokenizer.from_pretrained(
+             'openbmb/MiniCPM-V-4_5',
+             trust_remote_code=True
+         )
+         gr.Success("Model loaded successfully!")
+     return model, tokenizer
+
+ def map_to_nearest_scale(values, scale):
+     """Snap each value to its nearest neighbor on a uniform grid via a KD-tree."""
+     tree = cKDTree(np.asarray(scale)[:, None])
+     _, indices = tree.query(np.asarray(values)[:, None])
+     return np.asarray(scale)[indices]
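+ # e.g. with a 0.1 s grid, a frame at t = 3.14 s snaps to 3.1 s, which later
+ # becomes integer temporal id ~31 after division by TIME_SCALE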
+
+ def group_array(arr, size):
+     """Split arr into consecutive chunks of length size (last chunk may be shorter)."""
+     return [arr[i:i+size] for i in range(0, len(arr), size)]
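+ # e.g. group_array([10, 11, 12, 13, 14], 2) -> [[10, 11], [12, 13], [14]]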
+
+ def encode_video(video_path, choose_fps=3, force_packing=None):
+     def uniform_sample(l, n):
+         # Evenly spaced sample of n items, taken from the middle of each gap
+         gap = len(l) / n
+         idxs = [int(i * gap + gap / 2) for i in range(n)]
+         return [l[i] for i in idxs]
+
+     vr = VideoReader(video_path, ctx=cpu(0))
+     fps = vr.get_avg_fps()
+     video_duration = len(vr) / fps
+
+     # Decide how many frames to sample and how many to pack per temporal group
+     if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
+         packing_nums = 1
+         choose_frames = round(min(choose_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
+     else:
+         packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)
+         if packing_nums <= MAX_NUM_PACKING:
+             choose_frames = round(video_duration * choose_fps)
+         else:
+             choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
+             packing_nums = MAX_NUM_PACKING
+
+     frame_idx = list(range(len(vr)))
+     frame_idx = np.array(uniform_sample(frame_idx, choose_frames))
+
+     if force_packing:
+         packing_nums = min(force_packing, MAX_NUM_PACKING)
+
+     frames = vr.get_batch(frame_idx).asnumpy()
+
+     # Snap each sampled frame's timestamp onto the TIME_SCALE grid, convert to
+     # integer temporal ids, and group the ids per packing group
+     frame_idx_ts = frame_idx / fps
+     scale = np.arange(0, video_duration, TIME_SCALE)
+
+     frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
+     frame_ts_id = frame_ts_id.astype(np.int32)
+
+     assert len(frames) == len(frame_ts_id)
+
+     frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
+     frame_ts_id_group = group_array(frame_ts_id, packing_nums)
+
+     return frames, frame_ts_id_group, video_duration, len(frame_idx), packing_nums
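+ # e.g. a 120 s clip at choose_fps = 3 asks for 360 frames > MAX_NUM_FRAMES,
+ # so packing_nums = ceil(360 / 180) = 2 and temporal ids are grouped in pairs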
+
+ def process_video_and_question(video, question, fps, force_packing, history):
+     if video is None:
+         gr.Warning("Please upload a video first.")
+         return history, ""
+
+     if not question:
+         gr.Warning("Please enter a question.")
+         return history, ""
+
+     try:
+         # Load model if not already loaded
+         model, tokenizer = load_model()
+
+         # Encode video
+         gr.Info(f"Processing video with {fps} FPS...")
+         frames, frame_ts_id_group, duration, num_frames, packing_nums = encode_video(
+             video,
+             fps,
+             force_packing=force_packing if force_packing > 0 else None
+         )
+
+         # Prepare messages: the frames are passed inline, followed by the question
+         msgs = [
+             {'role': 'user', 'content': frames + [question]},
+         ]
+
+         # Get model response
+         gr.Info("Generating response...")
+         answer = model.chat(
+             msgs=msgs,
+             tokenizer=tokenizer,
+             use_image_id=False,
+             max_slice_nums=1,
+             temporal_ids=frame_ts_id_group
+         )
+
+         # Update chat history
+         history.append({
+             "role": "user",
+             "content": f"📹 [Video: {duration:.1f}s, {num_frames} frames, packing: {packing_nums}]\n{question}"
+         })
+         history.append({
+             "role": "assistant",
+             "content": answer
+         })
+
+         return history, ""
+
+     except Exception as e:
+         # Raising gr.Error surfaces the message as an error toast in the UI
+         raise gr.Error(f"Error processing video: {e}")
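+
+ # Note: the chat history uses Gradio's "messages" format, a list of
+ # {"role": ..., "content": ...} dicts matching Chatbot(type="messages") below
+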
+ def clear_chat():
+     # Reset all controls; the order matches the outputs= list wired to clear_btn
+     return [], None, "", 3, 0
+
+ # CSS for better styling
+ css = """
+ .chat-container {
+     overflow-y: auto;
+ }
+ """
+
+ # Create Gradio interface
+ with gr.Blocks(css=css, title="Video Chat with MiniCPM-V") as demo:
+     gr.Markdown(
+         """
+         # 🎥 Video Chat with MiniCPM-V-4.5
+
+         Upload a video and ask questions about it! The model uses advanced 3D-resampler
+         compression to process multiple frames efficiently.
+
+         **Note:** The first run downloads the ~8B-parameter model, which may take a few minutes.
+         """
+     )
+
+     with gr.Row():
+         # Main video area (takes most of the space)
+         with gr.Column(scale=3):
+             video_input = gr.Video(
+                 label="Upload Video",
+                 height=600
+             )
+
+         # Sidebar with chat and all controls
+         with gr.Column(scale=1):
+             chatbot = gr.Chatbot(
+                 label="Chat",
+                 height=300,
+                 type="messages",
+                 elem_classes="chat-container"
+             )
+
+             with gr.Row():
+                 question_input = gr.Textbox(
+                     label="Ask about the video",
+                     placeholder="e.g., Describe what happens in this video...",
+                     lines=2,
+                     scale=4
+                 )
+                 submit_btn = gr.Button("Send", variant="primary", scale=1)
+
+             with gr.Row():
+                 clear_btn = gr.Button("🗑️ Clear Chat")
+                 example_btn1 = gr.Button("📝 Describe")
+                 example_btn2 = gr.Button("🎬 Action")
+                 example_btn3 = gr.Button("👥 People")
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 fps_slider = gr.Slider(
+                     minimum=1,
+                     maximum=10,
+                     value=3,
+                     step=1,
+                     label="FPS for frame extraction",
+                     info="Higher FPS captures more detail but uses more memory"
+                 )
+
+                 force_packing_slider = gr.Slider(
+                     minimum=0,
+                     maximum=MAX_NUM_PACKING,
+                     value=0,
+                     step=1,
+                     label="Force Packing",
+                     info=f"0 = auto, 1-{MAX_NUM_PACKING} = force a specific packing number"
+                 )
+
+             gr.Markdown(
+                 """
+                 ### Video Info
+                 - Max frames: 180 × 3 packing = 540 frames
+                 - Temporal compression: 64 tokens per packed frame group
+                 - Supported formats: MP4, AVI, MOV, etc.
+                 """
+             )
+
+     # Example question buttons prefill the textbox
+     example_btn1.click(
+         lambda: "Describe this video in detail.",
+         outputs=question_input
+     )
+
+     example_btn2.click(
+         lambda: "What actions or events occur in this video?",
+         outputs=question_input
+     )
+
+     example_btn3.click(
+         lambda: "Are there any people in this video? If so, what are they doing?",
+         outputs=question_input
+     )
+
+     # Event handlers
+     submit_btn.click(
+         fn=process_video_and_question,
+         inputs=[video_input, question_input, fps_slider, force_packing_slider, chatbot],
+         outputs=[chatbot, question_input]
+     )
+
+     question_input.submit(
+         fn=process_video_and_question,
+         inputs=[video_input, question_input, fps_slider, force_packing_slider, chatbot],
+         outputs=[chatbot, question_input]
+     )
+
+     clear_btn.click(
+         fn=clear_chat,
+         outputs=[chatbot, video_input, question_input, fps_slider, force_packing_slider]
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             ["Describe what happens in this video"],
+             ["What is the main subject of this video?"],
+             ["Count the number of objects or people in the video"],
+             ["What emotions or mood does this video convey?"],
+             ["Summarize the key moments in this video"],
+         ],
+         inputs=question_input,
+         label="Example Questions"
+     )
+
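+ # Note: on shared GPU hardware it may help to call demo.queue() before
+ # demo.launch() so that concurrent requests are serialized (optional)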
+ if __name__ == "__main__":
+     demo.launch()
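
This commit uploads only app.py. A requirements.txt along these lines is implied by the imports above (a sketch, unpinned; the model's trust_remote_code path may pull in further packages):

gradio
torch
transformers
Pillow
decord
numpy
scipy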