{
  "bomFormat": "CycloneDX",
  "specVersion": "1.6",
  "serialNumber": "urn:uuid:0fd73414-9e65-47c5-8598-9745c0e73210",
  "version": 1,
  "metadata": {
    "timestamp": "2025-06-05T09:41:23.012895+00:00",
    "component": {
      "type": "machine-learning-model",
      "bom-ref": "OpenGVLab/InternVL-Chat-V1-5-c62ebeb8-2b6a-572f-945a-ebc349c329f4",
      "name": "OpenGVLab/InternVL-Chat-V1-5",
      "externalReferences": [
        {
          "url": "https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5",
          "type": "documentation"
        }
      ],
      "modelCard": {
        "modelParameters": {
          "task": "image-text-to-text",
          "architectureFamily": "internvl_chat",
          "modelArchitecture": "InternVLChatModel"
        },
        "properties": [
          {
            "name": "library_name",
            "value": "transformers"
          },
          {
            "name": "base_model",
            "value": "OpenGVLab/InternViT-6B-448px-V1-5, internlm/internlm2-chat-20b"
          },
          {
            "name": "base_model_relation",
            "value": "merge"
          }
        ]
      },
      "authors": [
        {
          "name": "OpenGVLab"
        }
      ],
      "licenses": [
        {
          "license": {
            "id": "MIT",
            "url": "https://spdx.org/licenses/MIT.html"
          }
        }
      ],
| "description": "<p align=\"center\"><img src=\"https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/D60YzQBIzvoCvLRp2gZ0A.jpeg\" alt=\"Image Description\" width=\"300\" height=\"300\"></p>> _Two interns holding hands, symbolizing the integration of InternViT and InternLM._We introduce InternVL 1.5, an open-source multimodal large language model (MLLM) to bridge the capability gap between open-source and proprietary commercial models in multimodal understanding.We introduce three simple designs:1. **Strong Vision Encoder:** we explored a continuous learning strategy for the large-scale vision foundation model---InternViT-6B, boosting its visual understanding capabilities, and making it can be transferred and reused in different LLMs.2. **Dynamic High-Resolution:** we divide images into tiles ranging from 1 to 40 of 448 \u00d7 448 pixels according to the aspect ratio and resolution of the input images, which supports up to 4K resolution input during inference.3. **High-Quality Bilingual Dataset:** we carefully collected a high-quality bilingual dataset that covers common scenes, document images, and annotated them with English and Chinese question-answer pairs, significantly enhancing performance in OCR- and Chinese-related tasks.", | |
| "tags": [ | |
| "transformers", | |
| "tensorboard", | |
| "safetensors", | |
| "internvl_chat", | |
| "feature-extraction", | |
| "internvl", | |
| "custom_code", | |
| "image-text-to-text", | |
| "conversational", | |
| "multilingual", | |
| "arxiv:2312.14238", | |
| "arxiv:2404.16821", | |
| "arxiv:2410.16261", | |
| "arxiv:2412.05271", | |
| "base_model:OpenGVLab/InternViT-6B-448px-V1-5", | |
| "base_model:merge:OpenGVLab/InternViT-6B-448px-V1-5", | |
| "base_model:internlm/internlm2-chat-20b", | |
| "base_model:merge:internlm/internlm2-chat-20b", | |
| "license:mit", | |
| "region:us" | |
| ] | |
| } | |
| } | |
| } |
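The sketch below shows one way this CycloneDX 1.6 AI BOM could be consumed programmatically: it loads the JSON with the standard library, checks the `bomFormat` and `specVersion` fields, and extracts the model name, task, base models, and license from `metadata.component`. The file name `internvl-chat-v1-5.cdx.json` and the helper `summarize_model_bom` are illustrative assumptions, not part of the BOM or of any particular library's API.

```python
import json
from pathlib import Path


def summarize_model_bom(path: str) -> dict:
    """Hypothetical helper: summarize a CycloneDX ML BOM like the one above."""
    bom = json.loads(Path(path).read_text(encoding="utf-8"))

    # Sanity-check that this is the document format we expect.
    assert bom.get("bomFormat") == "CycloneDX"
    assert bom.get("specVersion") == "1.6"

    component = bom["metadata"]["component"]
    model_card = component.get("modelCard", {})

    # modelCard.properties is a list of {"name": ..., "value": ...} pairs.
    properties = {p["name"]: p["value"] for p in model_card.get("properties", [])}

    return {
        "name": component["name"],
        "type": component["type"],
        "task": model_card.get("modelParameters", {}).get("task"),
        "base_models": [
            m.strip() for m in properties.get("base_model", "").split(",") if m.strip()
        ],
        "base_model_relation": properties.get("base_model_relation"),
        "licenses": [
            entry["license"].get("id")
            for entry in component.get("licenses", [])
            if "license" in entry
        ],
    }


if __name__ == "__main__":
    # Assumed local filename; adjust to wherever the BOM is stored.
    summary = summarize_model_bom("internvl-chat-v1-5.cdx.json")
    print(json.dumps(summary, indent=2))
```

Run against the document above, this would report `OpenGVLab/InternVL-Chat-V1-5` as an `image-text-to-text` model merged from `OpenGVLab/InternViT-6B-448px-V1-5` and `internlm/internlm2-chat-20b`, licensed under MIT.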