| | """ |
| | Example: |
| | python scripts/vlm/mllama_generate.py --load_from_hf |
| | """ |
| |
|
| | import argparse |
| |
|
| | import requests |
| | import torch |
| | from megatron.core.inference.common_inference_params import CommonInferenceParams |
| | from PIL import Image |
| | from transformers import AutoProcessor |
| |
|
| | from nemo import lightning as nl |
| | from nemo.collections import vlm |
| | from nemo.collections.vlm.inference import generate as vlm_generate |
| | from nemo.collections.vlm.inference import setup_inference_wrapper |
| |
|
| | model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct" |
| |
|
| |
|
def load_image(image_url: str) -> Optional[Image.Image]:
    """Download an image from `image_url`; return None if the request fails."""
    try:
        response = requests.get(image_url, stream=True)
        response.raise_for_status()
        return Image.open(response.raw)
    except requests.exceptions.RequestException as e:
        print(f"Error loading image from {image_url}: {e}")
        return None


def generate(model, processor, images, text, params):
    """Apply the chat template to the prompt and run VLM generation."""
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": text}],
        }
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

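    # Wrap the model so the Megatron inference engine can drive it.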
    model = setup_inference_wrapper(model, processor.tokenizer)

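    # vlm_generate takes parallel lists: one prompt here, paired with its list of images.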
    prompts = [input_text]
    images = [images]
    result = vlm_generate(
        model,
        processor.tokenizer,
        processor.image_processor,
        prompts,
        images,
        inference_params=params,
    )

    generated_texts = list(result)[0].generated_text

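    # Print from rank 0 only so tensor-parallel ranks do not duplicate output.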
    if torch.distributed.get_rank() == 0:
        print("======== GENERATED TEXT OUTPUT ========")
        print(f"{generated_texts}")
        print("=======================================")
    return generated_texts


def main(args) -> None:
    """Build a trainer, load the model, and run generation on the given images."""
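    # Tensor-parallel Megatron strategy; optimizer state is not needed for generation.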
    strategy = nl.MegatronStrategy(
        tensor_model_parallel_size=args.tp_size,
        ckpt_load_optimizer=False,
        ckpt_save_optimizer=False,
    )
    trainer = nl.Trainer(
        devices=args.tp_size,
        max_steps=1000,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
        val_check_interval=1000,
        limit_val_batches=50,
    )

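    # The Hugging Face processor supplies both the tokenizer and the image processor.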
    processor = AutoProcessor.from_pretrained(args.processor_name)
    tokenizer = processor.tokenizer

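    # Fabric handles checkpoint import/loading without running the training loop.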
    fabric = trainer.to_fabric()

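    # Either convert the Hugging Face checkpoint on the fly or load a local NeMo checkpoint.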
    if args.load_from_hf:
        model = fabric.import_model(f"hf://{model_id}", vlm.MLlamaModel)
    else:
        model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct(), tokenizer=tokenizer)
        model = fabric.load_model(args.local_model_path, model)

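    # Fetch all requested images up front; bail out if any download fails.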
    raw_images = [load_image(url) for url in args.image_url]
    if any(img is None for img in raw_images):
        return  # load_image already reported the failing URL

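    # Sampling settings for generation (greedy by default: top_k=1, top_p=0.0).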
    params = CommonInferenceParams(
        temperature=args.temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        num_tokens_to_generate=args.num_tokens_to_generate,
    )
    generate(model, processor, images=raw_images, text=args.prompt, params=params)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Multimodal Llama (MLlama) generation example")
    parser.add_argument(
        "--load_from_hf",
        action="store_true",
        help="Flag to indicate whether to load the model from the Hugging Face hub.",
    )
    parser.add_argument(
        "--local_model_path",
        type=str,
        default=None,
        help="Local path to the model if not loading from Hugging Face.",
    )
    parser.add_argument(
        "--processor_name",
        type=str,
        default="meta-llama/Llama-3.2-11B-Vision-Instruct",
        help="Name or path of the Hugging Face processor.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="<|image|>\nDescribe the image.",
        help="Input prompt.",
    )
    parser.add_argument(
        "--image_url",
        nargs='+',
        type=str,
        default=[
            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
        ],
        help="List of image URLs to use for inference.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Sampling temperature passed to CommonInferenceParams.",
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.0,
        help="top_p passed to CommonInferenceParams.",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=1,
        help="top_k passed to CommonInferenceParams.",
    )
    parser.add_argument(
        "--num_tokens_to_generate",
        type=int,
        default=50,
        help="Number of tokens to generate per prompt.",
    )
    # Parallelism flags; this script currently uses only tp_size.
    parser.add_argument("--devices", type=int, required=False, default=1)
    parser.add_argument("--tp_size", type=int, required=False, default=1)
    parser.add_argument("--pp_size", type=int, required=False, default=1)
    parser.add_argument("--encoder_pp_size", type=int, required=False, default=0)

    args = parser.parse_args()
    main(args)