nkkbr committed
Commit 8fcbc3e · 1 Parent(s): d026eab

update readme

Files changed (1): README.md +115 -0
README.md CHANGED
@@ -74,3 +74,118 @@ model-index:
        value: 66.50
        name: Appearance Order
---

**Currently under editing.**

## Installation

```bash
git clone https://github.com/nkkbr/ViCA.git
cd ViCA

conda create -n vica2 python=3.10 -y
conda activate vica2

# Install dependencies (with CUDA 12.1 support)
pip install --extra-index-url https://download.pytorch.org/whl/cu121 -e .

# FlashAttention is required and may need to be installed separately
pip install flash-attn==2.5.7
```
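After installing, a quick sanity check can confirm that the CUDA build of PyTorch and FlashAttention are importable. This is a minimal sketch, not part of the original README:

```python
# Environment sanity check (illustrative; not from the original README).
import torch
import flash_attn

print(torch.__version__)          # expect a +cu121 build
print(torch.cuda.is_available())  # should print True on a CUDA 12.1 machine
print(flash_attn.__version__)     # expect 2.5.7
```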

## Inference

*Here is a runnable example using ViCA2-7B on a VSI-Bench question.*

> **Note**: ViCA and ViCA2 use different model architectures. Please make sure to use the corresponding code for inference.

```python
103
+ # This inference script is adapted from:
104
+ # https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2
105
+
106
+ from vica2.model.builder import load_pretrained_model
107
+ from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
108
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
109
+ from llava.conversation import conv_templates, SeparatorStyle
110
+ from PIL import Image
111
+ import requests
112
+ import copy
113
+ import torch
114
+ import sys
115
+ import warnings
116
+ from decord import VideoReader, cpu
117
+ import numpy as np
118
+
119
+ warnings.filterwarnings("ignore")
120
+ def load_video(video_path, max_frames_num,fps=1,force_sample=False):
121
+ if max_frames_num == 0:
122
+ return np.zeros((1, 336, 336, 3))
123
+ vr = VideoReader(video_path, ctx=cpu(0),num_threads=1)
124
+ total_frame_num = len(vr)
125
+ video_time = total_frame_num / vr.get_avg_fps()
126
+ fps = round(vr.get_avg_fps()/fps)
127
+ frame_idx = [i for i in range(0, len(vr), fps)]
128
+ frame_time = [i/fps for i in frame_idx]
129
+ if len(frame_idx) > max_frames_num or force_sample:
130
+ sample_fps = max_frames_num
131
+ uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
132
+ frame_idx = uniform_sampled_frames.tolist()
133
+ frame_time = [i/vr.get_avg_fps() for i in frame_idx]
134
+ frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
135
+ spare_frames = vr.get_batch(frame_idx).asnumpy()
136
+ return spare_frames,frame_time,video_time
137
+
138
+ pretrained = "nkkbr/ViCA2-stage2-onevision-ft"
139
+ model_name = "vica_qwen"
140
+ device = "cuda"
141
+ device_map = "auto"
142
+ tokenizer, model, image_processor, image_processor_for_sam, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map)
143
+ model.eval()
144
+
145
+
146
+ from datasets import load_dataset
147
+ vsi_bench = load_dataset("nyu-visionx/VSI-Bench")
148
+ vsi_bench = vsi_bench['test']
149
+
150
+ data_curr = vsi_bench[90]
151
+
152
+ video_path = f"[VIDEO PATH]"
153
+ max_frames_num = 64
154
+ video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
155
+
156
+ video1= image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
157
+ video1 = [video1]
158
+ video2 = image_processor_for_sam.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
159
+ video2 = [video2]
160
+ conv_template = "qwen_1_5"
161
+ # time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
162
+ time_instruciton = ""
163
+ question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruciton}\n\n"
164
+ question += f"These are frames of a video.\n\n"
165
+ question += f"Question: {data_curr['question']}\n"
166
+ if data_curr['options'] is not None:
167
+ question += '\n'.join(data_curr['options']) + "\n"
168
+ question += f"Answer with the option’s letter from the given choices directly.\n"
169
+ else:
170
+ question += f"Please answer the question using a single word or phrase.\n"
171
+ print(f"Prompt:\n{question}")
172
+
173
+ conv = copy.deepcopy(conv_templates[conv_template])
174
+ conv.append_message(conv.roles[0], question)
175
+ conv.append_message(conv.roles[1], None)
176
+ prompt_question = conv.get_prompt()
177
+ input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
178
+ cont = model.generate(
179
+ input_ids,
180
+ images=video1,
181
+ images_for_sam=video2,
182
+ modalities= ["video"],
183
+ do_sample=False,
184
+ temperature=0,
185
+ max_new_tokens=1024,
186
+ )
187
+ text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
188
+ print(repr(text_outputs))
189
+ ```
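To score the prediction, you can compare it against the dataset's reference answer in the same session as the script above. This is a small sketch, not from the original README; it assumes the VSI-Bench record exposes a `ground_truth` column, which you should verify against your local copy of the dataset:

```python
# Illustrative check (assumes a `ground_truth` field on the VSI-Bench record).
prediction = text_outputs.strip()
reference = str(data_curr.get("ground_truth", "")).strip()
print(f"prediction: {prediction!r}")
print(f"reference:  {reference!r}")
print("exact match:", prediction == reference)
```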

---