Instructions to use OpenGVLab/InternVL-Chat-V1-5-AWQ with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use OpenGVLab/InternVL-Chat-V1-5-AWQ with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="OpenGVLab/InternVL-Chat-V1-5-AWQ", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("OpenGVLab/InternVL-Chat-V1-5-AWQ", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use OpenGVLab/InternVL-Chat-V1-5-AWQ with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "OpenGVLab/InternVL-Chat-V1-5-AWQ"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenGVLab/InternVL-Chat-V1-5-AWQ",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/OpenGVLab/InternVL-Chat-V1-5-AWQ

SGLang

How to use OpenGVLab/InternVL-Chat-V1-5-AWQ with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "OpenGVLab/InternVL-Chat-V1-5-AWQ" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenGVLab/InternVL-Chat-V1-5-AWQ",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "OpenGVLab/InternVL-Chat-V1-5-AWQ" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "OpenGVLab/InternVL-Chat-V1-5-AWQ",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use OpenGVLab/InternVL-Chat-V1-5-AWQ with Docker Model Runner:
```
docker model run hf.co/OpenGVLab/InternVL-Chat-V1-5-AWQ
```

czczup commited on Nov 21, 2024

Commit

ee05c12

verified ·

1 Parent(s): 2a33216

fix compatibility issue for transformers 4.46+

Browse files

Files changed (3) hide show

configuration_intern_vit.py +1 -0
configuration_internvl_chat.py +3 -3
modeling_internvl_chat.py +5 -6

configuration_intern_vit.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 import os
 from typing import Union

 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 import os
 from typing import Union

configuration_internvl_chat.py CHANGED Viewed

@@ -47,12 +47,12 @@ class InternVLChatConfig(PretrainedConfig):
             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config['architectures'][0] == 'LlamaForCausalLM':
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config['architectures'][0] == 'InternLM2ForCausalLM':
             self.llm_config = InternLM2Config(**llm_config)
         else:
-            raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.select_layer = select_layer

             logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
         self.vision_config = InternVisionConfig(**vision_config)
+        if llm_config.get(['architectures'])[0] == 'LlamaForCausalLM':
             self.llm_config = LlamaConfig(**llm_config)
+        elif llm_config.get(['architectures'])[0] == 'InternLM2ForCausalLM':
             self.llm_config = InternLM2Config(**llm_config)
         else:
+            raise ValueError('Unsupported architecture: {}'.format(llm_config.get(['architectures'])[0]))
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.select_layer = select_layer

modeling_internvl_chat.py CHANGED Viewed

@@ -3,6 +3,7 @@
 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 import warnings
 from typing import Any, List, Optional, Tuple, Union
@@ -236,7 +237,7 @@ class InternVLChatModel(PreTrainedModel):
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].to(self.device)
         attention_mask = model_inputs['attention_mask'].to(self.device)
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
@@ -245,7 +246,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
-        responses = [response.split(template.sep)[0].strip() for response in responses]
         return responses
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -264,7 +265,7 @@ class InternVLChatModel(PreTrainedModel):
         template = get_conv_template(self.template)
         template.system_message = self.system_message
-        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         history = [] if history is None else history
         for (old_question, old_answer) in history:
@@ -293,7 +294,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
-        response = response.split(template.sep)[0].strip()
         history.append((question, response))
         if return_history:
             return response, history
@@ -313,7 +314,6 @@ class InternVLChatModel(PreTrainedModel):
             visual_features: Optional[torch.FloatTensor] = None,
             generation_config: Optional[GenerationConfig] = None,
             output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
             **generate_kwargs,
     ) -> torch.LongTensor:
@@ -341,7 +341,6 @@ class InternVLChatModel(PreTrainedModel):
             attention_mask=attention_mask,
             generation_config=generation_config,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
             use_cache=True,
             **generate_kwargs,
         )

 # Copyright (c) 2024 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 import warnings
 from typing import Any, List, Optional, Tuple, Union
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].to(self.device)
         attention_mask = model_inputs['attention_mask'].to(self.device)
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
         generation_config['eos_token_id'] = eos_token_id
         generation_output = self.generate(
             pixel_values=pixel_values,
             **generation_config
         )
         responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split(template.sep.strip())[0].strip() for response in responses]
         return responses
     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
         template = get_conv_template(self.template)
         template.system_message = self.system_message
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
         history = [] if history is None else history
         for (old_question, old_answer) in history:
             **generation_config
         )
         response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+        response = response.split(template.sep.strip())[0].strip()
         history.append((question, response))
         if return_history:
             return response, history
             visual_features: Optional[torch.FloatTensor] = None,
             generation_config: Optional[GenerationConfig] = None,
             output_hidden_states: Optional[bool] = None,
             **generate_kwargs,
     ) -> torch.LongTensor:
             attention_mask=attention_mask,
             generation_config=generation_config,
             output_hidden_states=output_hidden_states,
             use_cache=True,
             **generate_kwargs,
         )