luodian commited on
Commit
8a008bc
·
1 Parent(s): 2154c33

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +4 -40
README.md CHANGED
@@ -30,6 +30,7 @@ license: other
30
  Here is an example of multi-modal ICL (in-context learning) with 🦦 Otter. We provide two demo images with corresponding instructions and answers, then we ask the model to generate an answer given our instruction. You may change your instruction and see how the model responds.
31
 
32
  Please first clone [Otter](https://github.com/Luodian/Otter) to your local disk. Place the following script inside the Otter folder to make sure it has access to otter/modeling_otter.py.
 
33
  ``` python
34
  import mimetypes
35
  import os
@@ -60,25 +61,6 @@ def get_content_type(file_path):
60
 
61
  # ------------------- Image and Video Handling Functions -------------------
62
 
63
-
64
- def extract_frames(video_path, num_frames=16):
65
- video = cv2.VideoCapture(video_path)
66
- total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
67
- frame_step = total_frames // num_frames
68
- frames = []
69
-
70
- for i in range(num_frames):
71
- video.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
72
- ret, frame = video.read()
73
- if ret:
74
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
75
- frame = Image.fromarray(frame).convert("RGB")
76
- frames.append(frame)
77
-
78
- video.release()
79
- return frames
80
-
81
-
82
  def get_image(url: str) -> Union[Image.Image, list]:
83
  if "://" not in url: # Local file
84
  content_type = get_content_type(url)
@@ -88,20 +70,8 @@ def get_image(url: str) -> Union[Image.Image, list]:
88
  if "image" in content_type:
89
  if "://" not in url: # Local file
90
  return Image.open(url)
91
- nne
92
  else: # Remote URL
93
  return Image.open(requests.get(url, stream=True, verify=False).raw)
94
- elif "video" in content_type:
95
- video_path = "temp_video.mp4"
96
- if "://" not in url: # Local file
97
- video_path = url
98
- else: # Remote URL
99
- with open(video_path, "wb") as f:
100
- f.write(requests.get(url, stream=True, verify=False).content)
101
- frames = extract_frames(video_path)
102
- if "://" in url: # Only remove the temporary video file if it was downloaded
103
- os.remove(video_path)
104
- return frames
105
  else:
106
  raise ValueError("Invalid content type. Expected image or video.")
107
 
@@ -120,13 +90,9 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
120
  input_data = image_list
121
 
122
  if isinstance(input_data, Image.Image):
123
- vision_x = (
124
- image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
125
- )
126
  elif isinstance(input_data, list): # list of video frames
127
- vision_x = (
128
- image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
129
- )
130
  else:
131
  raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
132
 
@@ -163,9 +129,7 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
163
  # ------------------- Main Function -------------------
164
 
165
  if __name__ == "__main__":
166
- model = OtterForConditionalGeneration.from_pretrained(
167
- "luodian/OTTER-9B-LA-InContext", device_map="auto"
168
- )
169
  model.text_tokenizer.padding_side = "left"
170
  tokenizer = model.text_tokenizer
171
  image_processor = transformers.CLIPImageProcessor()
 
30
  Here is an example of multi-modal ICL (in-context learning) with 🦦 Otter. We provide two demo images with corresponding instructions and answers, then we ask the model to generate an answer given our instruction. You may change your instruction and see how the model responds.
31
 
32
  Please first clone [Otter](https://github.com/Luodian/Otter) to your local disk. Place the following script inside the Otter folder to make sure it has access to otter/modeling_otter.py.
33
+
34
  ``` python
35
  import mimetypes
36
  import os
 
61
 
62
  # ------------------- Image and Video Handling Functions -------------------
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def get_image(url: str) -> Union[Image.Image, list]:
65
  if "://" not in url: # Local file
66
  content_type = get_content_type(url)
 
70
  if "image" in content_type:
71
  if "://" not in url: # Local file
72
  return Image.open(url)
 
73
  else: # Remote URL
74
  return Image.open(requests.get(url, stream=True, verify=False).raw)
 
 
 
 
 
 
 
 
 
 
 
75
  else:
76
  raise ValueError("Invalid content type. Expected image or video.")
77
 
 
90
  input_data = image_list
91
 
92
  if isinstance(input_data, Image.Image):
93
+ vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
 
 
94
  elif isinstance(input_data, list): # list of video frames
95
+ vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
 
 
96
  else:
97
  raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
98
 
 
129
  # ------------------- Main Function -------------------
130
 
131
  if __name__ == "__main__":
132
+ model = OtterForConditionalGeneration.from_pretrained("luodian/OTTER-9B-LA-InContext", device_map="auto")
 
 
133
  model.text_tokenizer.padding_side = "left"
134
  tokenizer = model.text_tokenizer
135
  image_processor = transformers.CLIPImageProcessor()