Update README.md

README.md (changed)
````diff
@@ -30,6 +30,7 @@ license: other
 Here is an example of multi-modal ICL (in-context learning) with 🦦 Otter. We provide two demo images with corresponding instructions and answers, then ask the model to generate an answer given our instruction. You may change the instruction and see how the model responds.
 
 Please first clone [Otter](https://github.com/Luodian/Otter) to your local disk, and place the following script inside the Otter folder so that it has access to otter/modeling_otter.py.
+
 ``` python
 import mimetypes
 import os
````
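For reference, the in-context examples are packed into a single prompt string in which each demo image is paired with its instruction and answer, followed by the query. A minimal sketch of a two-shot prompt, assuming the OpenFlamingo-style template used in Otter's demo code (verify the exact special tokens against the repository):

``` python
# Hypothetical two-shot ICL prompt. The <image>, <answer>, and <|endofchunk|>
# markers are assumed from Otter's demos, not guaranteed by this README.
demo_one = "<image>User: What is in this image? GPT:<answer> Two cats sleeping on a couch.<|endofchunk|>"
demo_two = "<image>User: What is in this image? GPT:<answer> A bathroom sink beside a mirror.<|endofchunk|>"
query = "<image>User: What is in this image? GPT:<answer>"
prompt = demo_one + demo_two + query
```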
```diff
@@ -60,25 +61,6 @@ def get_content_type(file_path):
 
 # ------------------- Image and Video Handling Functions -------------------
 
-
-def extract_frames(video_path, num_frames=16):
-    video = cv2.VideoCapture(video_path)
-    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-    frame_step = total_frames // num_frames
-    frames = []
-
-    for i in range(num_frames):
-        video.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
-        ret, frame = video.read()
-        if ret:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = Image.fromarray(frame).convert("RGB")
-            frames.append(frame)
-
-    video.release()
-    return frames
-
-
 def get_image(url: str) -> Union[Image.Image, list]:
     if "://" not in url:  # Local file
         content_type = get_content_type(url)
```
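The helper deleted above sampled a fixed number of evenly spaced frames with OpenCV. For readers who still need video input after this change, a condensed sketch of the same idea (`sample_frames` is our name; the logic mirrors the removed `extract_frames`):

``` python
import cv2
from PIL import Image

def sample_frames(video_path: str, num_frames: int = 16) -> list:
    # Grab num_frames evenly spaced frames, converted BGR -> RGB PIL images.
    video = cv2.VideoCapture(video_path)
    step = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) // num_frames
    frames = []
    for i in range(num_frames):
        video.set(cv2.CAP_PROP_POS_FRAMES, i * step)
        ok, frame = video.read()
        if ok:
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    video.release()
    return frames
```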
```diff
@@ -88,20 +70,8 @@ def get_image(url: str) -> Union[Image.Image, list]:
     if "image" in content_type:
         if "://" not in url:  # Local file
             return Image.open(url)
-        nne
         else:  # Remote URL
             return Image.open(requests.get(url, stream=True, verify=False).raw)
-    elif "video" in content_type:
-        video_path = "temp_video.mp4"
-        if "://" not in url:  # Local file
-            video_path = url
-        else:  # Remote URL
-            with open(video_path, "wb") as f:
-                f.write(requests.get(url, stream=True, verify=False).content)
-        frames = extract_frames(video_path)
-        if "://" in url:  # Only remove the temporary video file if it was downloaded
-            os.remove(video_path)
-        return frames
     else:
         raise ValueError("Invalid content type. Expected image or video.")
 
```
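Note that the retained remote branch calls `requests.get(url, stream=True, verify=False)`; `verify=False` disables TLS certificate verification, so drop it unless you are deliberately working around a self-signed certificate or an intercepting proxy.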
```diff
@@ -120,13 +90,9 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
     input_data = image_list
 
     if isinstance(input_data, Image.Image):
-        vision_x = (
-            image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess([input_data], return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     elif isinstance(input_data, list):  # list of video frames
-        vision_x = (
-            image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
-        )
+        vision_x = image_processor.preprocess(input_data, return_tensors="pt")["pixel_values"].unsqueeze(1).unsqueeze(0)
     else:
         raise ValueError("Invalid input data. Expected PIL Image or list of video frames.")
 
```
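Both branches perform the same shaping: `preprocess` returns pixel values of shape `(N, C, H, W)`, and the two `unsqueeze` calls add an axis at dim 1 and a leading batch axis. A quick shape check (the axis semantics follow OpenFlamingo conventions as we read them; verify against otter/modeling_otter.py):

``` python
import torch

# Stand-in for image_processor.preprocess(...)["pixel_values"]: (N, C, H, W).
pixel_values = torch.randn(16, 3, 224, 224)

# Same ops as the hunk above: new axis at dim 1, then a leading batch axis.
vision_x = pixel_values.unsqueeze(1).unsqueeze(0)
print(vision_x.shape)  # torch.Size([1, 16, 1, 3, 224, 224])
```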
```diff
@@ -163,9 +129,7 @@ def get_response(image_list, prompt: str, model=None, image_processor=None, in_c
 # ------------------- Main Function -------------------
 
 if __name__ == "__main__":
-    model = OtterForConditionalGeneration.from_pretrained(
-        "luodian/OTTER-9B-LA-InContext", device_map="auto"
-    )
+    model = OtterForConditionalGeneration.from_pretrained("luodian/OTTER-9B-LA-InContext", device_map="auto")
     model.text_tokenizer.padding_side = "left"
     tokenizer = model.text_tokenizer
     image_processor = transformers.CLIPImageProcessor()
```
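With the script saved inside the Otter checkout (any filename works, e.g. `demo.py`), a run is simply `python demo.py`. Since the checkpoint is loaded with `device_map="auto"`, 🤗 Accelerate decides placement, sharding the 9B model across whatever GPUs (or CPU memory) are available.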