update readme
README.md
CHANGED
@@ -74,3 +74,118 @@ model-index:
      value: 66.50
      name: Appearance Order
---

**This README is currently being edited.**

## Installation

```bash
git clone https://github.com/nkkbr/ViCA.git
cd ViCA

conda create -n vica2 python=3.10 -y
conda activate vica2

# Install dependencies (with CUDA 12.1 support)
pip install --extra-index-url https://download.pytorch.org/whl/cu121 -e .

# FlashAttention is required and may need to be installed separately
pip install flash-attn==2.5.7
```
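
After installing, it is worth confirming that PyTorch sees a CUDA device and that FlashAttention imports cleanly before downloading any checkpoints. A minimal sanity check (the exact version strings depend on your machine):

```python
# Post-install sanity check; assumes the steps above completed without errors.
import torch
import flash_attn  # an ImportError here usually means the wheel failed to build

print(torch.__version__)          # expect a +cu121 build
print(torch.cuda.is_available())  # expect True on a CUDA machine
print(flash_attn.__version__)     # expect 2.5.7, matching the pin above
```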

## Inference

*Here is a runnable example using ViCA2-7B on a VSI-Bench question.*

> **Note**: ViCA and ViCA2 use different model architectures. Please make sure to use the corresponding code for inference.

```python
# This inference script is adapted from:
# https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2

import copy
import warnings

import numpy as np
import torch
from datasets import load_dataset
from decord import VideoReader, cpu

from vica2.model.builder import load_pretrained_model
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token

warnings.filterwarnings("ignore")

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    # Sample frames at the requested fps; fall back to uniform sampling
    # whenever that would exceed max_frames_num (or force_sample is set).
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3)), "", 0
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    video_time = total_frame_num / vr.get_avg_fps()
    fps = round(vr.get_avg_fps() / fps)
    frame_idx = [i for i in range(0, len(vr), fps)]
    frame_time = [i / fps for i in frame_idx]
    if len(frame_idx) > max_frames_num or force_sample:
        sample_fps = max_frames_num
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frame_time = [i / vr.get_avg_fps() for i in frame_idx]
    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames, frame_time, video_time

pretrained = "nkkbr/ViCA2-stage2-onevision-ft"
model_name = "vica_qwen"
device = "cuda"
device_map = "auto"

# ViCA2 uses two image processors: one for the main vision tower, one for the SAM branch
tokenizer, model, image_processor, image_processor_for_sam, max_length = load_pretrained_model(
    pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map
)
model.eval()

vsi_bench = load_dataset("nyu-visionx/VSI-Bench")
vsi_bench = vsi_bench["test"]

data_curr = vsi_bench[90]  # a single VSI-Bench example

video_path = "[VIDEO PATH]"
max_frames_num = 64
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)

video1 = [image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()]
video2 = [image_processor_for_sam.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()]

conv_template = "qwen_1_5"
# time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}. Please answer the following questions related to this video."
time_instruction = ""

question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n\n"
question += "These are frames of a video.\n\n"
question += f"Question: {data_curr['question']}\n"
if data_curr["options"] is not None:
    question += "\n".join(data_curr["options"]) + "\n"
    question += "Answer with the option's letter from the given choices directly.\n"
else:
    question += "Please answer the question using a single word or phrase.\n"
print(f"Prompt:\n{question}")

conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
cont = model.generate(
    input_ids,
    images=video1,
    images_for_sam=video2,
    modalities=["video"],
    do_sample=False,  # greedy decoding
    temperature=0,
    max_new_tokens=1024,
)
text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip()
print(repr(text_outputs))
```
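
To run more than one benchmark question, the per-example steps above can be folded into a single function. The sketch below is not part of the repository: `answer_question` is a hypothetical helper name, and it assumes that `model`, `tokenizer`, `image_processor`, `image_processor_for_sam`, and `load_video` from the script above are already in scope.

```python
# A hypothetical convenience wrapper around the steps shown above.
import copy

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token

def answer_question(video_path, question_text, options=None, max_frames_num=64):
    video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
    video1 = [image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()]
    video2 = [image_processor_for_sam.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()]

    # Build the prompt exactly as in the script above
    question = DEFAULT_IMAGE_TOKEN + "\n\n\nThese are frames of a video.\n\n"
    question += f"Question: {question_text}\n"
    if options is not None:
        question += "\n".join(options) + "\n"
        question += "Answer with the option's letter from the given choices directly.\n"
    else:
        question += "Please answer the question using a single word or phrase.\n"

    conv = copy.deepcopy(conv_templates["qwen_1_5"])
    conv.append_message(conv.roles[0], question)
    conv.append_message(conv.roles[1], None)
    input_ids = tokenizer_image_token(
        conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    ).unsqueeze(0).to("cuda")

    out = model.generate(
        input_ids,
        images=video1,
        images_for_sam=video2,
        modalities=["video"],
        do_sample=False,
        temperature=0,
        max_new_tokens=1024,
    )
    return tokenizer.batch_decode(out, skip_special_tokens=True)[0].strip()

# e.g. pred = answer_question("[VIDEO PATH]", data_curr["question"], data_curr["options"])
```

Each dataset row carries the question text and, for multiple-choice items, an `options` list, but not a local file path, so the video location still has to be supplied by hand (hence the `[VIDEO PATH]` placeholder).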

---