JungleGym committed on
Commit
1740e26
·
verified ·
1 Parent(s): 578b1b7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +16 -2
README.md CHANGED
@@ -133,10 +133,24 @@ pip install flash-attn==2.7.4.post1 --no-build-isolation --no-cache-dir
133
 
134
  Using 🤗Transformers for Inference:
135
  ```python
 
 
136
  import torch
137
  from transformers import AutoModelForImageTextToText, AutoProcessor
138
  from qwen_vl_utils import process_vision_info
139
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  # Load model and processor
141
  model = AutoModelForImageTextToText.from_pretrained(
142
  "TencentARC/TimeLens-7B",
@@ -153,8 +167,8 @@ processor = AutoProcessor.from_pretrained(
153
  )
154
 
155
  # Prepare input
156
- query = "A man is sitting on a chair"
157
- video_path = "https://huggingface.co/datasets/JungleGym/TimeLens-Assets/blob/main/2Y8XQ.mp4"
158
 
159
  GROUNDER_PROMPT = "You are given a video with multiple frames. The numbers before each video frame indicate its sampling timestamp (in seconds). Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
160
 
 
133
 
134
  Using 🤗Transformers for Inference:
135
  ```python
136
+ import requests
137
+ import os
138
  import torch
139
  from transformers import AutoModelForImageTextToText, AutoProcessor
140
  from qwen_vl_utils import process_vision_info
141
 
142
+
143
+ def download_video(url):
144
+ save_path = os.path.basename(url)
145
+ if not os.path.exists(save_path):
146
+ print(f"Downloading video from {url}...")
147
+ response = requests.get(url, stream=True)
148
+ response.raise_for_status()
149
+ with open(save_path, 'wb') as f:
150
+ for chunk in response.iter_content(chunk_size=8192):
151
+ f.write(chunk)
152
+ return save_path
153
+
154
  # Load model and processor
155
  model = AutoModelForImageTextToText.from_pretrained(
156
  "TencentARC/TimeLens-7B",
 
167
  )
168
 
169
  # Prepare input
170
+ query = "A man drinks water with a glass"
171
+ video_path = download_video("https://huggingface.co/datasets/JungleGym/TimeLens-Assets/resolve/main/2Y8XQ.mp4")
172
 
173
  GROUNDER_PROMPT = "You are given a video with multiple frames. The numbers before each video frame indicate its sampling timestamp (in seconds). Please find the visual event described by the sentence '{}', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
174