|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Example: |
|
|
python scripts/vlm/neva_generate.py --load_from_hf |
|
|
python scripts/vlm/neva_generate.py --local_model_path <PATH_TO_MODEL> --enable_quantization |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
|
|
|
import requests |
|
|
import torch |
|
|
from megatron.core.inference.common_inference_params import CommonInferenceParams |
|
|
from PIL import Image |
|
|
from transformers import AutoProcessor |
|
|
|
|
|
import nemo.lightning as nl |
|
|
from nemo.collections.vlm import Llava15Config7B, LlavaModel |
|
|
from nemo.collections.vlm.inference import generate as vlm_generate |
|
|
from nemo.collections.vlm.inference import setup_inference_wrapper |
|
|
from nemo.utils import logging |
|
|
|
|
|
try: |
|
|
import modelopt.torch.quantization as mtq |
|
|
from megatron.core.post_training.modelopt.gpt.model_specs import get_gpt_modelopt_spec |
|
|
|
|
|
HAVE_MODELOPT = True |
|
|
|
|
|
except (ImportError, ModuleNotFoundError): |
|
|
|
|
|
HAVE_MODELOPT = False |
|
|
|
|
|
|
|
|
def load_image(image_url: str) -> Image.Image: |
|
|
|
|
|
try: |
|
|
response = requests.get(image_url, stream=True) |
|
|
response.raise_for_status() |
|
|
image = Image.open(response.raw) |
|
|
return image |
|
|
except requests.exceptions.RequestException as e: |
|
|
print(f"Error loading image from {image_url}: {e}") |
|
|
return None |
|
|
|
|
|
|
|
|
def generate(model, processor, images, text, params): |
|
|
|
|
|
conversation = [ |
|
|
{ |
|
|
"role": "user", |
|
|
"content": [ |
|
|
{"type": "text", "text": text}, |
|
|
{"type": "image"}, |
|
|
], |
|
|
}, |
|
|
] |
|
|
input_text = processor.apply_chat_template(conversation, add_generation_prompt=True) |
|
|
|
|
|
class NevaTokenizer: |
|
|
|
|
|
def __init__(self, tokenizer): |
|
|
self._tokenizer = tokenizer |
|
|
self.vocab_size = tokenizer.vocab_size |
|
|
self.eos_token_id = tokenizer.eos_token_id |
|
|
|
|
|
def decode(self, tokens, **kwargs): |
|
|
modified_tokens = [] |
|
|
for x in tokens: |
|
|
if x == -200: |
|
|
modified_tokens.append(0) |
|
|
elif x != 1: |
|
|
modified_tokens.append(x) |
|
|
return self._tokenizer.decode(modified_tokens, skip_special_tokens=False) |
|
|
|
|
|
def encode(self, prompt, **kwargs): |
|
|
prompts_tokens = self._tokenizer.encode(prompt, add_special_tokens=True) |
|
|
return [-200 if x == 32000 else x for x in prompts_tokens] |
|
|
|
|
|
model = setup_inference_wrapper(model, processor.tokenizer) |
|
|
|
|
|
prompts = [input_text] |
|
|
images = [images] |
|
|
result = vlm_generate( |
|
|
model, |
|
|
NevaTokenizer(processor.tokenizer), |
|
|
processor.image_processor, |
|
|
prompts, |
|
|
images, |
|
|
inference_params=params, |
|
|
) |
|
|
|
|
|
generated_texts = list(result)[0].generated_text |
|
|
|
|
|
if torch.distributed.get_rank() == 0: |
|
|
print("======== GENERATED TEXT OUTPUT ========") |
|
|
print(f"{generated_texts}") |
|
|
print("=======================================") |
|
|
|
|
|
return generated_texts |
|
|
|
|
|
|
|
|
def legacy_generate(model, processor, raw_image, text, num_tokens_to_generate): |
|
|
|
|
|
conversation = [ |
|
|
{ |
|
|
"role": "user", |
|
|
"content": [ |
|
|
{"type": "text", "text": text}, |
|
|
{"type": "image"}, |
|
|
], |
|
|
}, |
|
|
] |
|
|
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) |
|
|
hf_tokenizer = processor.tokenizer |
|
|
|
|
|
inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16) |
|
|
input_ids = hf_tokenizer(prompt, return_tensors='pt')['input_ids'].cuda() |
|
|
input_ids[input_ids == 32000] = -200 |
|
|
images = inputs['pixel_values'].cuda() |
|
|
images = images.reshape(images.size(0), 3, 336, 336) |
|
|
|
|
|
position_ids = ( |
|
|
torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids) |
|
|
) |
|
|
|
|
|
model = model.module.cuda() |
|
|
model.eval() |
|
|
generated_ids = input_ids.clone() |
|
|
|
|
|
|
|
|
for _ in range(num_tokens_to_generate): |
|
|
with torch.no_grad(): |
|
|
output = model( |
|
|
images=images, |
|
|
input_ids=input_ids, |
|
|
position_ids=position_ids, |
|
|
attention_mask=None, |
|
|
) |
|
|
|
|
|
next_token_ids = torch.argmax(output[:, -1], dim=-1, keepdim=True) |
|
|
|
|
|
generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1) |
|
|
|
|
|
input_ids = generated_ids |
|
|
position_ids = ( |
|
|
torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device) |
|
|
.unsqueeze(0) |
|
|
.expand_as(input_ids) |
|
|
) |
|
|
|
|
|
|
|
|
if next_token_ids.item() == hf_tokenizer.eos_token_id: |
|
|
break |
|
|
|
|
|
generated_ids[generated_ids == -200] = 0 |
|
|
generated_texts = hf_tokenizer.batch_decode(generated_ids, skip_special_tokens=False) |
|
|
logging.info("======== GENERATED TEXT OUTPUT ========") |
|
|
logging.info(f"{generated_texts}") |
|
|
logging.info("=======================================") |
|
|
|
|
|
|
|
|
def main(args) -> None: |
|
|
|
|
|
strategy = nl.MegatronStrategy( |
|
|
tensor_model_parallel_size=1, |
|
|
ckpt_include_optimizer=False, |
|
|
) |
|
|
trainer = nl.Trainer( |
|
|
devices=1, |
|
|
max_steps=1000, |
|
|
accelerator="gpu", |
|
|
strategy=strategy, |
|
|
plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), |
|
|
val_check_interval=1000, |
|
|
limit_val_batches=50, |
|
|
) |
|
|
|
|
|
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") |
|
|
hf_tokenizer = processor.tokenizer |
|
|
|
|
|
|
|
|
raw_image = load_image(args.image_url) |
|
|
if raw_image is None: |
|
|
return |
|
|
|
|
|
fabric = trainer.to_fabric() |
|
|
|
|
|
|
|
|
if args.load_from_hf: |
|
|
model = fabric.import_model("hf://llava-hf/llava-1.5-7b-hf", LlavaModel) |
|
|
else: |
|
|
config = Llava15Config7B() |
|
|
if args.enable_quantization: |
|
|
new_transformer_layer_spec = get_gpt_modelopt_spec( |
|
|
config.language_transformer_config, local_core_attention=False, remap_te_layernorm=True |
|
|
) |
|
|
config.language_transformer_config.transformer_layer_spec = new_transformer_layer_spec |
|
|
model = LlavaModel(config, tokenizer=hf_tokenizer) |
|
|
model = fabric.load_model(args.local_model_path, model) |
|
|
|
|
|
params = CommonInferenceParams( |
|
|
temperature=args.temperature, |
|
|
top_p=args.top_p, |
|
|
top_k=args.top_k, |
|
|
num_tokens_to_generate=args.num_tokens_to_generate, |
|
|
) |
|
|
if args.legacy_generate: |
|
|
legacy_generate(model, processor, raw_image, args.prompt, args.num_tokens_to_generate) |
|
|
else: |
|
|
generate(model, processor, images=raw_image, text=args.prompt, params=params) |
|
|
|
|
|
if args.enable_quantization: |
|
|
base_img_url = "http://images.cocodataset.org/val2017/" |
|
|
images = [ |
|
|
"000000039769.jpg", |
|
|
"000000002685.jpg", |
|
|
"000000004495.jpg", |
|
|
"000000005001.jpg", |
|
|
"000000003845.jpg", |
|
|
"000000011615.jpg", |
|
|
"000000010977.jpg", |
|
|
"000000010764.jpg", |
|
|
"000000010707.jpg", |
|
|
"000000010583.jpg", |
|
|
"000000010363.jpg", |
|
|
"000000010092.jpg", |
|
|
"000000009914.jpg", |
|
|
"000000009891.jpg", |
|
|
"000000009769.jpg", |
|
|
"000000009590.jpg", |
|
|
"000000009483.jpg", |
|
|
"000000009448.jpg", |
|
|
"000000009378.jpg", |
|
|
"000000008899.jpg", |
|
|
] |
|
|
quantization_images_url = [base_img_url + img_id for img_id in images] |
|
|
|
|
|
def forward_loop(): |
|
|
for img_url in quantization_images_url: |
|
|
raw_image = load_image(img_url) |
|
|
response = generate( |
|
|
model, processor, images=raw_image, text="can you describe this image?", params=params |
|
|
) |
|
|
print(img_url, "->", response) |
|
|
|
|
|
|
|
|
|
|
|
if args.quant_alg == "int8_sq": |
|
|
mtq_config = mtq.INT8_SMOOTHQUANT_CFG |
|
|
elif args.quant_alg == "fp8": |
|
|
mtq_config = mtq.FP8_DEFAULT_CFG |
|
|
elif args.quant_alg == "awq": |
|
|
mtq_config = mtq.INT4_AWQ_CFG |
|
|
else: |
|
|
raise ValueError(f"Unsupported quantization algorithm: {args.quantization.algorithm}") |
|
|
|
|
|
logging.info("-------- Start Quantization --------") |
|
|
mtq.quantize(model, mtq_config, forward_loop) |
|
|
logging.info("-------- End Quantization --------") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
parser = argparse.ArgumentParser(description="LLaVA Multimodal Inference") |
|
|
parser.add_argument( |
|
|
"--load_from_hf", |
|
|
action="store_true", |
|
|
help="Flag to indicate whether to load the model from Hugging Face hub.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--local_model_path", |
|
|
type=str, |
|
|
default=None, |
|
|
help="Local path to the model if not loading from Hugging Face.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--image_url", |
|
|
type=str, |
|
|
default="http://images.cocodataset.org/val2017/000000039769.jpg", |
|
|
help="URL of the image to use for inference.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--prompt", |
|
|
type=str, |
|
|
default="What are these?", |
|
|
help="Input prompt", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--temperature", |
|
|
type=float, |
|
|
default=1.0, |
|
|
help="""Temperature to be used in megatron.core.inference.common_inference_params.CommonInferenceParams""", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--top_p", |
|
|
type=float, |
|
|
default=0.0, |
|
|
help="""top_p to be used in megatron.core.inference.common_inference_params.CommonInferenceParams""", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--top_k", |
|
|
type=int, |
|
|
default=1, |
|
|
help="""top_k to be used in megatron.core.inference.common_inference_params.CommonInferenceParams""", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--num_tokens_to_generate", |
|
|
type=int, |
|
|
default=20, |
|
|
help="""Number of tokens to generate per prompt""", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--legacy_generate", |
|
|
action="store_true", |
|
|
help="Flag to indicate whether to use legacy generation function.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--enable_quantization", |
|
|
action="store_true", |
|
|
help="Flag to indicate whether to enable quantization.", |
|
|
) |
|
|
parser.add_argument( |
|
|
"--quant_alg", |
|
|
type=str, |
|
|
default="fp8", |
|
|
help="Input prompt", |
|
|
) |
|
|
args = parser.parse_args() |
|
|
|
|
|
main(args) |
|
|
|