Use transformers as the library name

by ariG23498 HF Staff - opened Oct 23, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

-97

Files changed (3) hide show

README.md +7 -95
llm/config.json +1 -1
sound_mm_projector/config.json +1 -1

README.md CHANGED Viewed

@@ -1,98 +1,10 @@
 ---
 library_name: transformers
-license: apache-2.0
-tags:
-- omni-modal
-- multimodal
-- vision
-- audio
-- video
-- llm
-model-index:
-- name: OmniVinci
-  results:
-  - task:
-      type: image-to-text
-      name: Image Understanding
-    dataset:
-      name: MVBench
-      type: mvbench
-    metrics:
-    - name: MVBench Score
-      type: accuracy
-      value: 70.6
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
-  - task:
-      type: video-to-text
-      name: Video Understanding
-    dataset:
-      name: Video-MME
-      type: video-mme
-    metrics:
-    - name: Video-MME (w/o sub)
-      type: accuracy
-      value: 68.2
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
-  - task:
-      type: video-to-text
-      name: Cross-Modal Understanding
-    dataset:
-      name: DailyOmni
-      type: dailyomni
-    metrics:
-    - name: DailyOmni Score
-      type: accuracy
-      value: 66.5
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
-  - task:
-      type: audio-to-text
-      name: Audio Understanding
-    dataset:
-      name: MMAR
-      type: mmar
-    metrics:
-    - name: MMAR Score
-      type: accuracy
-      value: 58.4
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
-  - task:
-      type: audio-to-text
-      name: Audio-Only Reasoning
-    dataset:
-      name: MMAU
-      type: mmau
-    metrics:
-    - name: MMAU Score
-      type: accuracy
-      value: 71.6
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
-  - task:
-      type: video-to-text
-      name: Multi-Modal Reasoning
-    dataset:
-      name: Worldsense
-      type: worldsense
-    metrics:
-    - name: Worldsense Score
-      type: accuracy
-      value: 48.2
-      source:
-        name: OmniVinci Technical Report
-        url: https://arxiv.org/abs/2510.15870
 ---
 # <span style="background: linear-gradient(45deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #f5576c 75%, #4facfe 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold; font-size: 1.1em;">**OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM**</span> <br />
-[![Paper](https://img.shields.io/badge/ArXiv-Paper-brown)](https://arxiv.org/abs/2510.15870)
 [![Code](https://img.shields.io/badge/GitHub-Link-blue)](https://github.com/NVlabs/OmniVinci)
 [![Model](https://img.shields.io/badge/HuggingFace-Model-yellow)](https://huggingface.co/nvidia/omnivinci)
 [![Website](https://img.shields.io/badge/Web-Page-orange)](https://nvlabs.github.io/OmniVinci)
@@ -191,10 +103,10 @@ The model is released under the [NVIDIA OneWay Noncommercial License](asset/NVID
 Please consider citing our paper and this framework if they are helpful in your research.
 ```bibtex
-@article{ye2025omnivinci,
-  title={OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM},
-  author={Ye, Hanrong and Yang, Chao-Han Huck and Goel, Arushi and Huang, Wei and Zhu, Ligeng and Su, Yuanhang and Lin, Sean and Cheng, An-Chieh and Wan, Zhen and Tian, Jinchuan and others},
-  journal={arXiv preprint arXiv:2510.15870},
-  year={2025}
 }
 ```

 ---
+license: other
 library_name: transformers
 ---
 # <span style="background: linear-gradient(45deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #f5576c 75%, #4facfe 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold; font-size: 1.1em;">**OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM**</span> <br />
+[![Paper](https://img.shields.io/badge/ArXiv-Paper-brown)](https://arxiv.org/abs/2510.15870)
 [![Code](https://img.shields.io/badge/GitHub-Link-blue)](https://github.com/NVlabs/OmniVinci)
 [![Model](https://img.shields.io/badge/HuggingFace-Model-yellow)](https://huggingface.co/nvidia/omnivinci)
 [![Website](https://img.shields.io/badge/Web-Page-orange)](https://nvlabs.github.io/OmniVinci)
 Please consider citing our paper and this framework if they are helpful in your research.
 ```bibtex
+@article{omnivinci2025,
+      title={OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM},
+      author={Hanrong Ye and Chao-Han Huck Yang and Arushi Goel and Wei Huang and Ligeng Zhu and Yuanhang Su and Sean Lin and An-Chieh Cheng and Zhen Wan and Jinchuan Tian and Yuming Lou and Dong Yang and Zhijian Liu and Yukang Chen and Ambrish Dantrey and Ehsan Jahangiri and Sreyan Ghosh and Daguang Xu and Ehsan Hosseini-Asl and Danial Mohseni Taheri and Vidya Murali and Sifei Liu and Jason Lu and Oluwatobi Olabiyi and Frank Wang and Rafael Valle and Bryan Catanzaro and Andrew Tao and Song Han and Jan Kautz and Hongxu Yin and Pavlo Molchanov},
+      journal={arXiv preprint arXiv:2510.15870},
+      year={2025},
 }
 ```

llm/config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "",
   "architectures": [
     "Qwen2ForCausalLM"
   ],

 {
+  "_name_or_path": "/home/hanrongy/user_path/project/vila/VILA-Internal/../exp_log/nvomni-8b-video-0d1-trope128_omniTwds_ras_audfilter_boost_lr5e6_demoonly_n1_bs128_ga8_mstep-1_j20250923/outputs/model/llm",
   "architectures": [
     "Qwen2ForCausalLM"
   ],

sound_mm_projector/config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "",
   "architectures": [
     "SoundMultimodalProjector"
   ],

 {
+  "_name_or_path": "/lustre/fs12/portfolios/llmservice/projects/llmservice_fm_vision/users/hanrongy/project/vila/VILA-Internal/../exp_log/nvomni-8b-video-0d1-trope128_omniT_ras_n16_bs2048_ga8_mstep-1_j20250718/outputs/model/sound_mm_projector",
   "architectures": [
     "SoundMultimodalProjector"
   ],