Update license to MIT and clarify component licenses

#1
by nielsr (HF Staff) - opened
Files changed (1)
  1. README.md +60 -35
README.md CHANGED
@@ -1,24 +1,24 @@
1
  ---
2
- license: apache-2.0
3
- pipeline_tag: image-text-to-text
4
- library_name: transformers
5
  base_model:
6
- - OpenGVLab/InternViT-300M-448px-V2_5
7
- - Qwen/Qwen3-0.6B
8
- base_model_relation: merge
9
  datasets:
10
- - OpenGVLab/MMPR-v1.2
11
- - OpenGVLab/MMPR-Tiny
12
  language:
13
- - multilingual
 
 
 
14
  tags:
15
- - internvl
16
- - custom_code
 
17
  ---
18
 
19
  # InternVL3_5-1B-Pretrained
20
 
21
- [\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
22
 
23
  [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[🗨️ Chat Demo\]](https://chat.intern-ai.org.cn/) [\[🚀 Quick Start\]](#quick-start) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
24
 
@@ -423,7 +423,7 @@ You are an AI assistant that rigorously follows this response protocol:
423
  Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
424
  """.strip()
425
 
426
- model.system_message = R1_SYSTEMP_PROMPT
427
  ```
428
 
429
  ### Inference with Transformers
@@ -530,40 +530,50 @@ generation_config = dict(max_new_tokens=1024, do_sample=True)
530
  # pure-text conversation (纯文本对话)
531
  question = 'Hello, who are you?'
532
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
533
- print(f'User: {question}\nAssistant: {response}')
 
534
 
535
  question = 'Can you tell me a story?'
536
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
537
- print(f'User: {question}\nAssistant: {response}')
 
538
 
539
  # single-image single-round conversation (单图单轮对话)
540
- question = '<image>\nPlease describe the image shortly.'
 
541
  response = model.chat(tokenizer, pixel_values, question, generation_config)
542
- print(f'User: {question}\nAssistant: {response}')
 
543
 
544
  # single-image multi-round conversation (单图多轮对话)
545
- question = '<image>\nPlease describe the image in detail.'
 
546
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
547
- print(f'User: {question}\nAssistant: {response}')
 
548
 
549
  question = 'Please write a poem according to the image.'
550
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
551
- print(f'User: {question}\nAssistant: {response}')
 
552
 
553
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
554
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
555
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
556
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
557
 
558
- question = '<image>\nDescribe the two images in detail.'
 
559
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
560
  history=None, return_history=True)
561
- print(f'User: {question}\nAssistant: {response}')
 
562
 
563
  question = 'What are the similarities and differences between these two images.'
564
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
565
  history=history, return_history=True)
566
- print(f'User: {question}\nAssistant: {response}')
 
567
 
568
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
569
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -571,17 +581,21 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat1
571
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
572
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
573
 
574
- question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
 
 
575
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
576
  num_patches_list=num_patches_list,
577
  history=None, return_history=True)
578
- print(f'User: {question}\nAssistant: {response}')
 
579
 
580
  question = 'What are the similarities and differences between these two images.'
581
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
582
  num_patches_list=num_patches_list,
583
  history=history, return_history=True)
584
- print(f'User: {question}\nAssistant: {response}')
 
585
 
586
  # batch inference, single image per sample (单图批处理)
587
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
@@ -589,13 +603,15 @@ pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat1
589
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
590
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
591
 
592
- questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
 
593
  responses = model.batch_chat(tokenizer, pixel_values,
594
  num_patches_list=num_patches_list,
595
  questions=questions,
596
  generation_config=generation_config)
597
  for question, response in zip(questions, responses):
598
- print(f'User: {question}\nAssistant: {response}')
 
599
 
600
  # video multi-round conversation (视频多轮对话)
601
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
@@ -633,17 +649,24 @@ def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=3
633
  video_path = './examples/red-panda.mp4'
634
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
635
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
636
- video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
 
637
  question = video_prefix + 'What is the red panda doing?'
638
- # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
 
 
 
 
639
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
640
  num_patches_list=num_patches_list, history=None, return_history=True)
641
- print(f'User: {question}\nAssistant: {response}')
 
642
 
643
  question = 'Describe this video in detail.'
644
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
645
  num_patches_list=num_patches_list, history=history, return_history=True)
646
- print(f'User: {question}\nAssistant: {response}')
 
647
  ```
648
 
649
  #### Streaming Output
@@ -727,7 +750,9 @@ image_urls=[
727
 
728
  images = [load_image(img_url) for img_url in image_urls]
729
  # Numbering images improves multi-image conversations
730
- response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
 
 
731
  print(response.text)
732
  ```
733
 
@@ -816,7 +841,7 @@ print(response)
816
 
817
  ## License
818
 
819
- This project is released under the apache-2.0 License. This project uses the pre-trained Qwen3 as a component, which is licensed under the apache-2.0 License.
820
 
821
  ## Citation
822
 
@@ -829,4 +854,4 @@ If you find this project useful in your research, please consider citing:
829
  journal={arXiv preprint arXiv:2508.18265},
830
  year={2025}
831
  }
832
- ```
 
1
  ---
 
 
 
2
  base_model:
3
+ - OpenGVLab/InternViT-300M-448px-V2_5
4
+ - Qwen/Qwen3-0.6B
 
5
  datasets:
6
+ - OpenGVLab/MMPR-v1.2
7
+ - OpenGVLab/MMPR-Tiny
8
  language:
9
+ - multilingual
10
+ library_name: transformers
11
+ license: mit
12
+ pipeline_tag: image-text-to-text
13
  tags:
14
+ - internvl
15
+ - custom_code
16
+ base_model_relation: merge
17
  ---
18
 
19
  # InternVL3_5-1B-Pretrained
20
 
21
+ [\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL) [\[📜 InternVL 1.0\]](https://huggingface.co/papers/2312.14238) [\[📜 InternVL 1.5\]](https://huggingface.co/papers/2404.16821) [\[📜 InternVL 2.5\]](https://huggingface.co/papers/2412.05271) [\[📜 InternVL2.5-MPO\]](https://huggingface.co/papers/2411.10442) [\[📜 InternVL3\]](https://huggingface.co/papers/2504.10479) [\[📜 InternVL3.5\]](https://huggingface.co/papers/2508.18265)
22
 
23
  [\[🆕 Blog\]](https://internvl.github.io/blog/) [\[🗨️ Chat Demo\]](https://chat.intern-ai.org.cn/) [\[🚀 Quick Start\]](#quick-start) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/)
24
 
 
423
  Ensure that the thinking process is thorough but remains focused on the query. The final answer should be standalone and not reference the thinking section.
424
  """.strip()
425
 
426
+ model.system_message = R1_SYSTEM_PROMPT
427
  ```
428
 
429
  ### Inference with Transformers
 
530
  # pure-text conversation (纯文本对话)
531
  question = 'Hello, who are you?'
532
  response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
533
+ print(f'User: {question}\nAssistant: {response}')
535
 
536
  question = 'Can you tell me a story?'
537
  response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
538
+ print(f'User: {question}\nAssistant: {response}')
540
 
541
  # single-image single-round conversation (单图单轮对话)
542
+ question = '<image>\nPlease describe the image shortly.'
544
  response = model.chat(tokenizer, pixel_values, question, generation_config)
545
+ print(f'User: {question}\nAssistant: {response}')
547
 
548
  # single-image multi-round conversation (单图多轮对话)
549
+ question = '<image>\nPlease describe the image in detail.'
551
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
552
+ print(f'User: {question}\nAssistant: {response}')
554
 
555
  question = 'Please write a poem according to the image.'
556
  response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
557
+ print(f'User: {question}\nAssistant: {response}')
559
 
560
  # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
561
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
562
  pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
563
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
564
 
565
+ question = '<image>\nDescribe the two images in detail.'
567
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
568
  history=None, return_history=True)
569
+ print(f'User: {question}\nAssistant: {response}')
571
 
572
  question = 'What are the similarities and differences between these two images.'
573
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
574
  history=history, return_history=True)
575
+ print(f'User: {question}\nAssistant: {response}')
577
 
578
  # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
579
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 
581
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
582
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
583
 
584
+ question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
587
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
588
  num_patches_list=num_patches_list,
589
  history=None, return_history=True)
590
+ print(f'User: {question}\nAssistant: {response}')
592
 
593
  question = 'What are the similarities and differences between these two images.'
594
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
595
  num_patches_list=num_patches_list,
596
  history=history, return_history=True)
597
+ print(f'User: {question}\nAssistant: {response}')
599
 
600
  # batch inference, single image per sample (单图批处理)
601
  pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
 
603
  num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
604
  pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
605
 
606
+ questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
608
  responses = model.batch_chat(tokenizer, pixel_values,
609
  num_patches_list=num_patches_list,
610
  questions=questions,
611
  generation_config=generation_config)
612
  for question, response in zip(questions, responses):
613
+ print(f'User: {question}\nAssistant: {response}')
615
 
616
  # video multi-round conversation (视频多轮对话)
617
  def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
 
649
  video_path = './examples/red-panda.mp4'
650
  pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
651
  pixel_values = pixel_values.to(torch.bfloat16).cuda()
652
+ video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
654
  question = video_prefix + 'What is the red panda doing?'
655
+ # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
660
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
661
  num_patches_list=num_patches_list, history=None, return_history=True)
662
+ print(f'User: {question}\nAssistant: {response}')
664
 
665
  question = 'Describe this video in detail.'
666
  response, history = model.chat(tokenizer, pixel_values, question, generation_config,
667
  num_patches_list=num_patches_list, history=history, return_history=True)
668
+ print(f'User: {question}\nAssistant: {response}')
670
  ```
671
 
672
  #### Streaming Output
 
750
 
751
  images = [load_image(img_url) for img_url in image_urls]
752
  # Numbering images improves multi-image conversations
753
+ response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
756
  print(response.text)
757
  ```
758
 
 
841
 
842
  ## License
843
 
844
+ This project is released under the [MIT License](https://github.com/OpenGVLab/InternVL/blob/main/LICENSE). Parts of this project, such as the pre-trained Qwen3 component, are licensed under the Apache-2.0 License.
845
 
846
  ## Citation
847
 
 
854
  journal={arXiv preprint arXiv:2508.18265},
855
  year={2025}
856
  }
857
+ ```