Update README.md
Browse files
README.md
CHANGED
```diff
@@ -155,10 +155,10 @@ model.eval()
 video_path = "XXXX"
 max_frames_num = 64
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
-video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().
+video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)
 video = [video]
 conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
-
+time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
 perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
 task_prompt = "Describe in details what you see from the video frames."
 question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"
```