Image-Text-to-Text
Transformers
TensorBoard
Safetensors
multilingual
internvl_chat
feature-extraction
internvl
custom_code
conversational
Instructions for using OpenGVLab/InternVL-Chat-V1-5 with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
- Transformers
How to use OpenGVLab/InternVL-Chat-V1-5 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="OpenGVLab/InternVL-Chat-V1-5", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]
pipe(text=messages)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("OpenGVLab/InternVL-Chat-V1-5", trust_remote_code=True, dtype="auto")
```
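Because the checkpoint ships custom modeling code, you can also call its own `chat()` method directly; its signature appears in the `modeling_internvl_chat.py` commit further down this page. A minimal single-tile sketch, assuming ImageNet normalization and a hypothetical local `example.jpg` (the model card's `load_image()` helper additionally performs dynamic tiling for high-resolution inputs):

```python
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import AutoModel, AutoTokenizer

path = "OpenGVLab/InternVL-Chat-V1-5"
model = AutoModel.from_pretrained(path, trust_remote_code=True,
                                  torch_dtype=torch.bfloat16).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

# Minimal single-tile preprocessing; mean/std are the ImageNet constants
# this sketch assumes the repo's image transform uses.
transform = T.Compose([
    T.Resize((448, 448)),
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])
image = Image.open("example.jpg").convert("RGB")  # placeholder local image
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=512, do_sample=False)
response = model.chat(tokenizer, pixel_values, "Describe the image.",
                      generation_config)
print(response)
```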
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use OpenGVLab/InternVL-Chat-V1-5 with vLLM:
Install vLLM from pip and serve the model:

```shell
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "OpenGVLab/InternVL-Chat-V1-5"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenGVLab/InternVL-Chat-V1-5",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe this image in one sentence."},
          {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}}
        ]
      }
    ]
  }'
```
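Equivalently, you can call the server from Python with the `openai` client; a brief sketch, assuming the server started above is listening on localhost:8000 (the API key can be any placeholder for a local vLLM server):

```python
# Query the local vLLM server through its OpenAI-compatible API.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="OpenGVLab/InternVL-Chat-V1-5",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```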
Use Docker:

```shell
docker model run hf.co/OpenGVLab/InternVL-Chat-V1-5
```
- SGLang
How to use OpenGVLab/InternVL-Chat-V1-5 with SGLang:
Install SGLang from pip and serve the model:

```shell
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "OpenGVLab/InternVL-Chat-V1-5" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenGVLab/InternVL-Chat-V1-5",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe this image in one sentence."},
          {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}}
        ]
      }
    ]
  }'
```
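The same endpoint can also be hit from Python; a small sketch with `requests`, assuming the server above is listening on port 30000:

```python
# POST to the SGLang server's OpenAI-compatible chat endpoint.
import requests

payload = {
    "model": "OpenGVLab/InternVL-Chat-V1-5",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
                },
            ],
        }
    ],
}
resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```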
Use Docker images:

```shell
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "OpenGVLab/InternVL-Chat-V1-5" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenGVLab/InternVL-Chat-V1-5",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe this image in one sentence."},
          {"type": "image_url", "image_url": {"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"}}
        ]
      }
    ]
  }'
```

- Docker Model Runner
How to use OpenGVLab/InternVL-Chat-V1-5 with Docker Model Runner:
```shell
docker model run hf.co/OpenGVLab/InternVL-Chat-V1-5
```
Upload folder using huggingface_hub
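This commit replaces the hardcoded `<|im_end|>` end-of-sequence handling in the batched and single-turn chat paths with the conversation template's separator token, and adds `InternLM2DecoderLayer` to `_no_split_modules` so that device_map-based sharding keeps InternLM2 decoder layers intact.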
modeling_internvl_chat.py (+6, -11)

```diff
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']

     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
@@ -237,10 +237,6 @@ class InternVLChatModel(PreTrainedModel):
             raise NotImplementedError
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
-        if tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
-            eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')  # 92542, InternLM2
-        else:
-            eos_token_id = tokenizer.eos_token_id

         from .conversation import get_conv_template

@@ -259,6 +255,7 @@ class InternVLChatModel(PreTrainedModel):
         model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id

         generation_output = self.generate(
@@ -268,7 +265,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
-        responses = [response.split('<|im_end|>')[0].strip() for response in responses]
+        responses = [response.split(template.sep)[0].strip() for response in responses]
         return responses

     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -276,10 +273,6 @@ class InternVLChatModel(PreTrainedModel):

         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
-        if tokenizer.convert_tokens_to_ids('<|im_end|>') != 0:
-            eos_token_id = tokenizer.convert_tokens_to_ids('<|im_end|>')  # 92542, InternLM2
-        else:
-            eos_token_id = tokenizer.eos_token_id

         from .conversation import get_conv_template

@@ -300,7 +293,9 @@ class InternVLChatModel(PreTrainedModel):
         model_inputs = tokenizer(query, return_tensors='pt')
         input_ids = model_inputs['input_ids'].cuda()
         attention_mask = model_inputs['attention_mask'].cuda()
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
         generation_config['eos_token_id'] = eos_token_id
+
         generation_output = self.generate(
             pixel_values=pixel_values,
             input_ids=input_ids,
@@ -308,7 +303,7 @@ class InternVLChatModel(PreTrainedModel):
             **generation_config
         )
         response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
-        response = response.split('<|im_end|>')[0].strip()
+        response = response.split(template.sep)[0].strip()
         history.append((question, response))
         if return_history:
             return response, history
```
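For intuition about the change: with an InternLM2-style chat template the separator is `<|im_end|>`, so the new lookup reproduces the old constant while generalizing to other templates. A minimal standalone sketch (not the repo's code; `template_sep` stands in for `template.sep` from `get_conv_template()`):

```python
# Hedged sketch: resolve the eos token id from a template separator instead
# of hardcoding it, as this commit does.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "OpenGVLab/InternVL-Chat-V1-5", trust_remote_code=True)

template_sep = '<|im_end|>'  # InternLM2-style separator, assumed here
eos_token_id = tokenizer.convert_tokens_to_ids(template_sep)
print(eos_token_id)  # expected 92542, per the comment removed in this commit
```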