| | ---
|
| | license: cc-by-sa-4.0
|
| | ---
|
| |
|
| | ```bash
|
| | pip install transformers==4.51.0
|
| | ```
|
| |
|
| |
|
| | ## single image
|
| |
|
| | ```python
|
# Single-image example: ask the model to describe one image given by URL.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

REPO_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    use_fast=True,
)
processor.tokenizer.padding_side = "left"

# One user turn: the image entry followed by the text instruction.
image_part = {
    "type": "image",
    "image": "https://www.ilankelman.org/stopsigns/australia.jpg",
}
text_part = {"type": "text", "text": "Describe this image."}
messages = [{"role": "user", "content": [image_part, text_part]}]

# Render the chat template to a prompt string (no tokenization yet).
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
text_list = [prompt]
image_inputs, video_inputs = processor.process_vision_info(messages)

# Build the model inputs and move them, then the model, to the GPU.
inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
).to("cuda")
model = model.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=1024)
output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)
|
| | ```
|
| |
|
| | ## stream generation
|
| |
|
| | ```python
|
# Streaming example: print the generated text chunk by chunk while
# model.generate runs in a background thread.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel, AutoTokenizer
import torch

from transformers import TextIteratorStreamer
import threading

REPO_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    attn_implementation='flash_attention_2',
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    use_fast=True,
)
processor = AutoProcessor.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    use_fast=True,
)
processor.tokenizer.padding_side = "left"

# One user turn: the image entry followed by the text instruction.
image_part = {
    "type": "image",
    "image": "https://www.ilankelman.org/stopsigns/australia.jpg",
}
text_part = {"type": "text", "text": "Describe this image."}
messages = [{"role": "user", "content": [image_part, text_part]}]

prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
text_list = [prompt]
image_inputs, video_inputs = processor.process_vision_info(messages)

inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
).to("cuda")
model = model.to("cuda")

# The streamer is handed to generate() and iterated over below.
streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)

sampling_options = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "top_p": 0.95,
    "temperature": 0.8,
}
generation_kwargs = dict(**inputs, streamer=streamer, **sampling_options)

# Run generation off the main thread so the loop below can consume the
# streamer while generate() is still producing text.
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
|
| | ```
|
| |
|
| | ## multiple-images
|
| |
|
| | ```python
|
# Multi-image example: one user turn containing two images and one question.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

REPO_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    use_fast=True,
)
processor.tokenizer.padding_side = "left"

# Image entries come first, in order, followed by the text instruction.
image_urls = [
    "https://www.ilankelman.org/stopsigns/australia.jpg",
    "https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png",
]
content = [{"type": "image", "image": url} for url in image_urls]
content.append({"type": "text", "text": "Describe these two images."})
messages = [{"role": "user", "content": content}]

prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
text_list = [prompt]
image_inputs, video_inputs = processor.process_vision_info(messages)

inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
).to("cuda")
model = model.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=1024)
output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)
|
| | ```
|
| |
|
| | ## single video
|
| |
|
| | ```python
|
| |
|
# Single-video example: ask the model to describe one video file.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

# Load the model and the processor from the same Hub repository so the
# example runs on any machine (no dependency on a local checkout).
MODEL_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(
    MODEL_ID, trust_remote_code=True, use_fast=True
)
processor.tokenizer.padding_side = "left"

# One user turn: the video entry followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                # NOTE: relative local path; point this at your own video file.
                "video": "../Eagle2-8B/space_woaudio.mp4",
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

# Render the chat template to a prompt string (no tokenization yet).
text_list = [
    processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
]

# With return_video_kwargs=True a third value is returned; it is forwarded
# to the processor unchanged through `videos_kwargs`.
image_inputs, video_inputs, video_kwargs = processor.process_vision_info(
    messages, return_video_kwargs=True
)

inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
    videos_kwargs=video_kwargs,
)
inputs = inputs.to("cuda")
model = model.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=1024)
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
|
| |
|
| | ```
|
| |
|
| | ## multiple videos
|
| |
|
| | ```python
|
# Multi-video example: one user turn containing two videos and one question.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

# Load the model and the processor from the same Hub repository so the
# example runs on any machine (no dependency on a local checkout).
MODEL_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(
    MODEL_ID, trust_remote_code=True, use_fast=True
)
processor.tokenizer.padding_side = "left"

# Each video entry carries its own "nframes" setting (10 for both here).
# NOTE: the video paths are relative local paths; point them at your own files.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "../Eagle2-8B/space_woaudio.mp4",
                "nframes": 10,
            },
            {
                "type": "video",
                "video": "../Eagle2-8B/video_ocr.mp4",
                "nframes": 10,
            },
            {"type": "text", "text": "Describe these two videos respectively."},
        ],
    }
]

# Render the chat template to a prompt string (no tokenization yet).
text_list = [
    processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
]

# With return_video_kwargs=True a third value is returned; it is forwarded
# to the processor unchanged through `videos_kwargs`.
image_inputs, video_inputs, video_kwargs = processor.process_vision_info(
    messages, return_video_kwargs=True
)

inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
    videos_kwargs=video_kwargs,
)
inputs = inputs.to("cuda")
model = model.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=1024)
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
|
| | ```
|
| |
|
| | ## batch inference
|
| |
|
| | ```python
|
# Batch example: two independent single-image conversations in one
# generate() call.
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModel
import torch

REPO_ID = "NVEagle/Eagle2.5-VL-8B-Preview"

model = AutoModel.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(
    REPO_ID,
    trust_remote_code=True,
    use_fast=True,
)
processor.tokenizer.padding_side = "left"

# First conversation: the stop-sign photo.
messages1 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://www.ilankelman.org/stopsigns/australia.jpg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Second conversation: the NVIDIA logo.
messages2 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

conversations = [messages1, messages2]

# One prompt string per conversation, in the same order as `conversations`.
text_list = []
for conversation in conversations:
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    text_list.append(prompt)

image_inputs, video_inputs = processor.process_vision_info(conversations)

inputs = processor(
    text=text_list,
    images=image_inputs,
    videos=video_inputs,
    return_tensors="pt",
    padding=True,
).to("cuda")
model = model.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=1024)
output_text = processor.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)
|
| | ``` |