| | --- |
| | license: cc-by-sa-4.0 |
| | --- |
| | |
| | ```bash |
| | pip install transformers==4.51.0 |
| | ``` |
| |
|
| |
|
| | ## single image |
| |
|
| | ```python |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel |
| | import torch |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, torch_dtype=torch.bfloat16) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "image", |
| | "image": "https://www.ilankelman.org/stopsigns/australia.jpg", |
| | }, |
| | {"type": "text", "text": "Describe this image."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | )] |
| | image_inputs, video_inputs = processor.process_vision_info(messages) |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | generated_ids = model.generate(**inputs, max_new_tokens=1024) |
| | output_text = processor.batch_decode( |
| | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| | ) |
| | print(output_text) |
| | ``` |
| |
|
| | ## stream generation |
| |
|
| | ```python |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel, AutoTokenizer |
| | import torch |
| | |
| | from transformers import TextIteratorStreamer |
| | import threading |
| | |
| | |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16) |
| | tokenizer = AutoTokenizer.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "image", |
| | "image": "https://www.ilankelman.org/stopsigns/australia.jpg", |
| | }, |
| | {"type": "text", "text": "Describe this image."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | )] |
| | image_inputs, video_inputs = processor.process_vision_info(messages) |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | |
| | streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) |
| | |
| | generation_kwargs = dict( |
| | **inputs, |
| | streamer=streamer, |
| | max_new_tokens=1024, |
| | do_sample=True, |
| | top_p=0.95, |
| | temperature=0.8 |
| | ) |
| | thread = threading.Thread(target=model.generate, kwargs=generation_kwargs) |
| | thread.start() |
| | |
| | |
| | for new_text in streamer: |
| | print(new_text, end="", flush=True) |
| | ``` |
| |
|
| | ## multiple images |
| |
|
| | ```python |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel |
| | import torch |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, torch_dtype=torch.bfloat16) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "image", |
| | "image": "https://www.ilankelman.org/stopsigns/australia.jpg", |
| | }, |
| | { |
| | "type": "image", |
| | "image": "https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png", |
| | }, |
| | {"type": "text", "text": "Describe these two images."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | )] |
| | image_inputs, video_inputs = processor.process_vision_info(messages) |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | generated_ids = model.generate(**inputs, max_new_tokens=1024) |
| | output_text = processor.batch_decode( |
| | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| | ) |
| | print(output_text) |
| | ``` |
| |
|
| | ## single video |
| |
|
| | ```python |
| | |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel |
| | import torch |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, torch_dtype=torch.bfloat16) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "video", |
| | "video": "../Eagle2-8B/space_woaudio.mp4", |
| | }, |
| | {"type": "text", "text": "Describe this video."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | )] |
| | image_inputs, video_inputs, video_kwargs = processor.process_vision_info(messages, return_video_kwargs=True) |
| | |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True, videos_kwargs=video_kwargs) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | generated_ids = model.generate(**inputs, max_new_tokens=1024) |
| | output_text = processor.batch_decode( |
| | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| | ) |
| | print(output_text) |
| | |
| | ``` |
| |
|
| | ## multiple videos |
| |
|
| | ```python |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel |
| | import torch |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, torch_dtype=torch.bfloat16) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "video", |
| | "video": "../Eagle2-8B/space_woaudio.mp4", |
| | "nframes": 10, |
| | }, |
| | { |
| | "type": "video", |
| | "video": "../Eagle2-8B/video_ocr.mp4", |
| | "nframes": 10, |
| | }, |
| | {"type": "text", "text": "Describe these two videos respectively."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | )] |
| | image_inputs, video_inputs, video_kwargs = processor.process_vision_info(messages, return_video_kwargs=True) |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True, videos_kwargs=video_kwargs) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | generated_ids = model.generate(**inputs, max_new_tokens=1024) |
| | output_text = processor.batch_decode( |
| | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| | ) |
| | print(output_text) |
| | ``` |
| |
|
| | ## batch inference |
| |
|
| | ```python |
| | from PIL import Image |
| | import requests |
| | from transformers import AutoProcessor, AutoModel |
| | import torch |
| | model = AutoModel.from_pretrained("nvidia/Eagle-2.5-8B",trust_remote_code=True, torch_dtype=torch.bfloat16) |
| | processor = AutoProcessor.from_pretrained("nvidia/Eagle-2.5-8B", trust_remote_code=True, use_fast=True) |
| | processor.tokenizer.padding_side = "left" |
| | |
| | messages1 = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "image", |
| | "image": "https://www.ilankelman.org/stopsigns/australia.jpg", |
| | }, |
| | {"type": "text", "text": "Describe this image."}, |
| | ], |
| | } |
| | ] |
| | |
| | messages2 = [ |
| | { |
| | "role": "user", |
| | "content": [ |
| | { |
| | "type": "image", |
| | "image": "https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png", |
| | }, |
| | {"type": "text", "text": "Describe this image."}, |
| | ], |
| | } |
| | ] |
| | |
| | text_list = [processor.apply_chat_template( |
| | messages, tokenize=False, add_generation_prompt=True |
| | ) for messages in [messages1, messages2]] |
| | image_inputs, video_inputs = processor.process_vision_info([messages1, messages2]) |
| | inputs = processor(text = text_list, images=image_inputs, videos=video_inputs, return_tensors="pt", padding=True) |
| | inputs = inputs.to("cuda") |
| | model = model.to("cuda") |
| | generated_ids = model.generate(**inputs, max_new_tokens=1024) |
| | output_text = processor.batch_decode( |
| | generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False |
| | ) |
| | print(output_text) |
| | ``` |