MLAdaptiveIntelligence
/

LLaVAction-0.5B

Video-Text-to-Text

text-generation

text-generation-inference

Model card Files Files and versions

shaokaiye committed on Mar 24

Commit

e3b2b5e

·

verified ·

1 Parent(s): 92521ab

Update README.md

Files changed (1) hide show

README.md +1 -1

README.md CHANGED Viewed

@@ -81,7 +81,7 @@ video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sa
 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().half()
 video = [video]
 conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
-time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
 perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
 task_prompt = "Describe in details what you see from the video frames."
 question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"

 video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().half()
 video = [video]
 conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
+time_instruction = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
 perspective_prompt = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
 task_prompt = "Describe in details what you see from the video frames."
 question = DEFAULT_IMAGE_TOKEN + f"\n{time_instruction}\n{perspective_prompt} {task_prompt}"