THUdyh
/

Oryx-7B

@@ -63,8 +63,8 @@ device = "cuda"
 device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
 model.eval()
-video_path = ""
-max_frames_num = "64"
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
 video = [video]
@@ -77,11 +77,11 @@ conv.append_message(conv.roles[0], question)
 conv.append_message(conv.roles[1], None)
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
-output_ids = model.generate(
     inputs=input_ids,
     images=input_data[0][0],
     images_highres=input_data[0][1],
-    modalities=video_data[2],
     do_sample=False,
     temperature=0,
     max_new_tokens=128,

 device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
 model.eval()
+video_path = "your_path_to_a_video_file"
+max_frames_num = 64
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
 video = [video]
 conv.append_message(conv.roles[1], None)
 prompt_question = conv.get_prompt()
 input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+cont = model.generate(
     inputs=input_ids,
     images=input_data[0][0],
     images_highres=input_data[0][1],
+    modalities=input_data[2],
     do_sample=False,
     temperature=0,
     max_new_tokens=128,