Update license to MIT and clarify component licenses
#1 · opened by nielsr (HF Staff)

README.md CHANGED
@@ -1,24 +1,24 @@
 ---
-license: apache-2.0
-pipeline_tag: image-text-to-text
-library_name: transformers
 base_model:
-- OpenGVLab/InternViT-300M-448px-V2_5
-- Qwen/Qwen3-0.6B
-base_model_relation: merge
+- OpenGVLab/InternViT-300M-448px-V2_5
+- Qwen/Qwen3-0.6B
 datasets:
-- OpenGVLab/MMPR-v1.2
-- OpenGVLab/MMPR-Tiny
+- OpenGVLab/MMPR-v1.2
+- OpenGVLab/MMPR-Tiny
 language:
-- multilingual
+- multilingual
+library_name: transformers
+license: mit
+pipeline_tag: image-text-to-text
 tags:
-- internvl
-- custom_code
+- internvl
+- custom_code
+base_model_relation: merge
 ---
 
 # InternVL3_5-1B-Pretrained
 
-[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5 …
+[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
 
 [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[🗨️ Chat Demo\]](https://chat.intern-ai.org.cn/) [\[🚀 Quick Start\]](#quick-start) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
 
@@ -423,7 +423,7 @@ You are an AI assistant that rigorously follows this response protocol:
 Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
 """.strip()
 
-model.system_message = …
+model.system_message = R1_SYSTEM_PROMPT
 ```
 
 ### Inference with Transformers
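For context, `R1_SYSTEM_PROMPT` is the thinking-mode system prompt defined just above this hunk in the README. A minimal sketch of how it is applied, assuming `model` and `tokenizer` were loaded with `AutoModel.from_pretrained(..., trust_remote_code=True)` as in the README's earlier setup:

```python
# Sketch only; R1_SYSTEM_PROMPT is the triple-quoted prompt defined above,
# and model/tokenizer are assumed to be loaded as in the README's setup code.
model.system_message = R1_SYSTEM_PROMPT  # switch the model to thinking-style responses
generation_config = dict(max_new_tokens=1024, do_sample=True)
response = model.chat(tokenizer, None, 'Why is the sky blue?', generation_config)
print(response)
```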
@@ -530,40 +530,50 @@ generation_config = dict(max_new_tokens=1024, do_sample=True)
 # pure-text conversation (纯文本对话)
 question = 'Hello, who are you?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Can you tell me a story?'
 response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image single-round conversation (单图单轮对话)
-question = '<image>\nPlease describe the image shortly.'
+question = '<image>
+Please describe the image shortly.'
 response = model.chat(tokenizer, pixel_values, question, generation_config)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # single-image multi-round conversation (单图多轮对话)
-question = '<image>\nPlease describe the image in detail.'
+question = '<image>
+Please describe the image in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Please write a poem according to the image.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-question = '<image>\nDescribe the two images in detail.'
+question = '<image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
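The snippets in this hunk rely on a `load_image` helper whose definition the diff skips. A plausible sketch of its shape, assuming the `build_transform` and `dynamic_preprocess` helpers defined earlier in the README:

```python
import torch
from PIL import Image

# Assumed shape of the elided helper: tile the image with dynamic_preprocess,
# normalize each tile with build_transform, and stack the tiles into one tensor.
def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size,
                               use_thumbnail=True, max_num=max_num)
    pixel_values = torch.stack([transform(tile) for tile in tiles])
    return pixel_values
```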
@@ -571,17 +581,21 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 
-question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
+question = 'Image-1: <image>
+Image-2: <image>
+Describe the two images in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'What are the similarities and differences between these two images.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list,
                                history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 # batch inference, single image per sample (单图批处理)
 pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
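One detail worth spelling out: `num_patches_list` tells `model.chat` how many tiles belong to each `<image>` placeholder, which is what keeps the two images separate here instead of letting them be treated as one concatenated image. A quick sanity check (sketch):

```python
# Each entry counts the tiles of one image; their sum must match the first
# dimension of the tensor produced by torch.cat above.
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
assert sum(num_patches_list) == pixel_values.size(0)
```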
@@ -589,13 +603,15 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
 num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
 pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
 
-questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
+questions = ['<image>
+Describe the image in detail.'] * len(num_patches_list)
 responses = model.batch_chat(tokenizer, pixel_values,
                              num_patches_list=num_patches_list,
                              questions=questions,
                              generation_config=generation_config)
 for question, response in zip(questions, responses):
-    print(f'User: {question}\nAssistant: {response}')
+    print(f'User: {question}
+Assistant: {response}')
 
 # video multi-round conversation (视频多轮对话)
 def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -633,17 +649,24 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
 video_path = './examples/red-panda.mp4'
 pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
 pixel_values = pixel_values.to(torch.bfloat16).cuda()
-video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
+video_prefix = ''.join([f'Frame{i+1}: <image>
+' for i in range(len(num_patches_list))])
 question = video_prefix + 'What is the red panda doing?'
-# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
+# Frame1: <image>
+Frame2: <image>
+...
+Frame8: <image>
+{question}
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 
 question = 'Describe this video in detail.'
 response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=history, return_history=True)
-print(f'User: {question}\nAssistant: {response}')
+print(f'User: {question}
+Assistant: {response}')
 ```
 
 #### Streaming Output
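The hunk header above references `get_index` and `load_video`, but the diff elides their bodies. A sketch consistent with those signatures, assuming the decord-based uniform frame sampling used in other InternVL model cards and the `build_transform`/`dynamic_preprocess` helpers from earlier in the file:

```python
import numpy as np
import torch
from PIL import Image
from decord import VideoReader, cpu

# Uniformly sample num_segments frame indices between bound[0] and bound[1] seconds
# (or across the whole clip when bound is None).
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    return np.array([int(start_idx + seg_size / 2 + np.round(seg_size * idx))
                     for idx in range(num_segments)])

# Decode the sampled frames, tile and normalize each one, and return the stacked
# tensor plus the per-frame tile counts that model.chat expects.
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())
    transform = build_transform(input_size=input_size)
    pixel_values_list, num_patches_list = [], []
    for frame_index in get_index(bound, fps, max_frame, num_segments=num_segments):
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        tiles = dynamic_preprocess(img, image_size=input_size,
                                   use_thumbnail=True, max_num=max_num)
        tiles = torch.stack([transform(tile) for tile in tiles])
        num_patches_list.append(tiles.shape[0])
        pixel_values_list.append(tiles)
    return torch.cat(pixel_values_list), num_patches_list
```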
@@ -727,7 +750,9 @@ image_urls=[
 
 images = [load_image(img_url) for img_url in image_urls]
 # Numbering images improves multi-image conversations
-response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
+response = pipe((f'Image-1: {IMAGE_TOKEN}
+Image-2: {IMAGE_TOKEN}
+describe these two images', images))
 print(response.text)
 ```
 
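Here `pipe` and `IMAGE_TOKEN` come from the README's earlier LMDeploy setup, which this hunk does not show. The assumed shape of that setup (a sketch, not part of the diff; the model name and session length are assumptions based on this repo's card):

```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN

# Build a multimodal pipeline for this repo's model.
pipe = pipeline('OpenGVLab/InternVL3_5-1B-Pretrained',
                backend_config=TurbomindEngineConfig(session_len=16384, tp=1))
```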
@@ -816,7 +841,7 @@ print(response)
 
 ## License
 
-This project is released under the …
+This project is released under the [MIT License](https://github.com/OpenGVLab/InternVL/blob/main/LICENSE). Parts of this project, such as the pre-trained Qwen3 component, are licensed under the Apache-2.0 License.
 
 ## Citation
 
@@ -829,4 +854,4 @@ If you find this project useful in your research, please consider citing:
   journal={arXiv preprint arXiv:2508.18265},
   year={2025}
 }
-```
+```