[Paper](https://arxiv.org/abs/2503.10582) | [Website](https://tiger-ai-lab.github.io/VisualWebInstruct/)

# Example Usage

To perform inference using MAmmoTH-VL2, you can use the following code snippet:

```python
# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git

from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import conv_templates

from PIL import Image
import requests
import copy
import torch

# Load MAmmoTH-VL2
pretrained = "TIGER-Lab/MAmmoTH-VL2"
model_name = "llava_qwen"
device = "cuda:0"  # specify a single GPU
device_map = {"": device}

tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained,
    None,
    model_name,
    device_map=device_map,
    multimodal=True
)
model.eval()
model = model.to(device)

# Load image
image_url = "https://raw.githubusercontent.com/jymmmmm/VISUALWEBINSTRUCT/main/image.png"
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
images = [image]
image_sizes = [[image.size[0], image.size[1]]]

# Prepare prompt
prompt = "In the picture shown below, prove ΔWXY and ΔZWY are similar. Please conclude your answer as Answer: xxx at the end if possible."

# Set up conversation template
try:
    conv_template = "qwen_2_5"
    conv = copy.deepcopy(conv_templates[conv_template])
except KeyError:
    available_templates = list(conv_templates.keys())
    for template_name in available_templates:
        if 'qwen' in template_name.lower():
            conv_template = template_name
            break
    else:
        conv_template = available_templates[0]
    conv = copy.deepcopy(conv_templates[conv_template])

# Add question with image
question = DEFAULT_IMAGE_TOKEN + "\n" + prompt
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()

# Prepare model inputs (tokenizer_image_token maps the <image> placeholder
# to IMAGE_TOKEN_INDEX so the model can splice in the image features)
input_ids = tokenizer_image_token(
    prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
).unsqueeze(0).to(device)
attention_mask = torch.ones_like(input_ids)

# Process image
image_tensor = process_images(images, image_processor, model.config)
if isinstance(image_tensor, list):
    image_tensor = [img.to(dtype=torch.float16, device=device) for img in image_tensor]
else:
    image_tensor = image_tensor.to(dtype=torch.float16, device=device)

# Generate response
with torch.no_grad():
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        images=image_tensor,
        image_sizes=image_sizes,
        do_sample=False,
        temperature=0,
        max_new_tokens=512,
    )

# Decode response (generate returns only the newly generated tokens here,
# since LLaVA feeds the model inputs_embeds rather than input_ids)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Response:", response)
```
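
Because the prompt asks the model to finish with `Answer: xxx`, you may want to extract that final answer from the decoded text. The snippet below is a minimal sketch of one way to do so; the `sample` string is a hypothetical model output used only for illustration:

```python
import re

def extract_final_answer(response):
    # Return the text after the last "Answer:" marker, or None if absent
    matches = re.findall(r"Answer:\s*(.+)", response)
    return matches[-1].strip() if matches else None

# Hypothetical model output, for illustration only
sample = "Both triangles share angle Y and have a right angle. Answer: ΔWXY ~ ΔZWY by AA similarity"
print(extract_final_answer(sample))  # -> ΔWXY ~ ΔZWY by AA similarity
```
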
# Citation
```
@article{visualwebinstruct,