Update README.md

README.md (changed)
````diff
@@ -30,6 +30,7 @@ license: other
 Here is an example of multi-modal ICL (in-context learning) with 🦦 Otter. We provide two demo images with corresponding instructions and answers, then ask the model to generate an answer given our instruction. You may change the instruction and see how the model responds.
 
 Please first clone [Otter](https://github.com/Luodian/Otter) to your local disk, and place the following script inside the Otter folder so that it has access to otter/modeling_otter.py.
+
 ``` python
 import mimetypes
 import os
````
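For reference, the in-context examples are packed into a single prompt string in which each demo image is paired with its instruction and answer, followed by the query. A minimal sketch of a two-shot prompt, assuming the OpenFlamingo-style template used in Otter's demo code (verify the exact special tokens against the repository):

``` python
# Hypothetical two-shot ICL prompt. The <image>, <answer>, and <|endofchunk|>
# markers are assumed from Otter's demos, not guaranteed by this README.
demo_one = "<image>User: What is in this image? GPT:<answer> Two cats sleeping on a couch.<|endofchunk|>"
demo_two = "<image>User: What is in this image? GPT:<answer> A bathroom sink beside a mirror.<|endofchunk|>"
query = "<image>User: What is in this image? GPT:<answer>"
prompt = demo_one + demo_two + query
```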
```diff
@@ -60,25 +61,6 @@ def get_content_type(file_path):
 
 # ------------------- Image and Video Handling Functions -------------------
 
-
-def extract_frames(video_path, num_frames=16):
-    video = cv2.VideoCapture(video_path)
-    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-    frame_step = total_frames // num_frames
-    frames = []
-
-    for i in range(num_frames):
-        video.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
-        ret, frame = video.read()
-        if ret:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = Image.fromarray(frame).convert("RGB")
-            frames.append(frame)
-
-    video.release()
-    return frames
-
-
 def get_image(url: str) -> Union[Image.Image, list]:
     if "://" not in url:  # Local file
         content_type = get_content_type(url)
```
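The helper deleted above sampled a fixed number of evenly spaced frames with OpenCV. For readers who still need video input after this change, a condensed sketch of the same idea (`sample_frames` is our name; the logic mirrors the removed `extract_frames`):

``` python
import cv2
from PIL import Image

def sample_frames(video_path: str, num_frames: int = 16) -> list:
    # Grab num_frames evenly spaced frames, converted BGR -> RGB PIL images.
    video = cv2.VideoCapture(video_path)
    step = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) // num_frames
    frames = []
    for i in range(num_frames):
        video.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ok, frame = video.read()
        if ok:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
```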
```diff
@@ -88,20 +70,8 @@ def get_image(url: str) -> Union[Image.Image, list]:
     if "image" in content_type:
         if "://" not in url:  # Local file
             return Image.open(url)
-        nne
         else:  # Remote URL
             return Image.open(requests.get(url, stream=True, verify=False).raw)
-    elif "video" in content_type:
-        video_path = "temp_video.mp4"
-        if "://" not in url:  # Local file
-            video_path = url
-        else:  # Remote URL
-            with open(video_path, "wb") as f:
-                f.write(requests.get(url, stream=True, verify=False).content)
-        frames = extract_frames(video_path)
-        if "://" in url:  # Only remove the temporary video file if it was downloaded
-            os.remove(video_path)
-        return frames
     else:
         raise ValueError("Invalid content type. Expected image or video.")
 
```
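Note that the retained remote branch calls `requests.get(url, stream=True, verify=False)`; `verify=False` disables TLS certificate verification, so drop it unless you are deliberately working around a self-signed certificate or an intercepting proxy.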
```diff
@@ -120,13 +90,9 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
     input_data = image_list
 
     if isinstance(input_data, Image.Image):
-        vision_x = (
-            image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     elif isinstance(input_data, list):  # list of video frames
-        vision_x = (
-            image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     else:
         raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
 
```
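Both branches perform the same shaping: `preprocess` returns pixel values of shape `(N, C, H, W)`, and the two `unsqueeze` calls add an axis at dim 1 and a leading batch axis. A quick shape check (the axis semantics follow OpenFlamingo conventions as we read them; verify against otter/modeling_otter.py):

``` python
import torch

# Stand-in for image_processor.preprocess(...)["pixel_values"]: (N, C, H, W).
pixel_values = torch.randn(16, 3, 224, 224)

# Same ops as the hunk above: new axis at dim 1, then a leading batch axis.
vision_x = pixel_values.unsqueeze(1).unsqueeze(0)
print(vision_x.shape)  # torch.Size([1, 16, 1, 3, 224, 224])
```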
```diff
@@ -163,9 +129,7 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
 # ------------------- Main Function -------------------
 
 if __name__ == "__main__":
-    model = OtterForConditionalGeneration.from_pretrained(
-        "luodian/OTTER-9B-LA-InContext", device_map="auto"
-    )
+    model = OtterForConditionalGeneration.from_pretrained("luodian/OTTER-9B-LA-InContext", device_map="auto")
     model.text_tokenizer.padding_side = "left"
     tokenizer = model.text_tokenizer
     image_processor = transformers.CLIPImageProcessor()
```
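With the script saved inside the Otter checkout (any filename works, e.g. `demo.py`), a run is simply `python demo.py`. Since the checkpoint is loaded with `device_map="auto"`, 🤗 Accelerate decides placement, sharding the 9B model across whatever GPUs (or CPU memory) are available.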