nvidia
/

omnivinci

@@ -1,6 +1,94 @@
 ---
-license: other
 library_name: transformers
 ---
 # <span style="background: linear-gradient(45deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #f5576c 75%, #4facfe 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold; font-size: 1.1em;">**OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM**</span> <br />

 ---
 library_name: transformers
+license: apache-2.0
+tags:
+- omni-modal
+- multimodal
+- vision
+- audio
+- video
+- llm
+model-index:
+- name: OmniVinci
+  results:
+  - task:
+      type: video-to-text
+      name: Video Understanding
+    dataset:
+      name: MVBench
+      type: mvbench
+    metrics:
+    - name: MVBench Score
+      type: accuracy
+      value: 70.6
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
+  - task:
+      type: video-to-text
+      name: Video Understanding
+    dataset:
+      name: Video-MME
+      type: video-mme
+    metrics:
+    - name: Video-MME (w/o sub)
+      type: accuracy
+      value: 68.2
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
+  - task:
+      type: video-to-text
+      name: Cross-Modal Understanding
+    dataset:
+      name: DailyOmni
+      type: dailyomni
+    metrics:
+    - name: DailyOmni Score
+      type: accuracy
+      value: 66.5
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
+  - task:
+      type: audio-to-text
+      name: Audio Understanding
+    dataset:
+      name: MMAR
+      type: mmar
+    metrics:
+    - name: MMAR Score
+      type: accuracy
+      value: 58.4
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
+  - task:
+      type: audio-to-text
+      name: Audio-Only Reasoning
+    dataset:
+      name: MMAU
+      type: mmau
+    metrics:
+    - name: MMAU Score
+      type: accuracy
+      value: 71.6
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
+  - task:
+      type: video-to-text
+      name: Multi-Modal Reasoning
+    dataset:
+      name: Worldsense
+      type: worldsense
+    metrics:
+    - name: Worldsense Score
+      type: accuracy
+      value: 48.2
+      source:
+        name: OmniVinci Technical Report
+        url: https://arxiv.org/abs/2510.15870
 ---
 # <span style="background: linear-gradient(45deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #f5576c 75%, #4facfe 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; font-weight: bold; font-size: 1.1em;">**OmniVinci: Enhancing Architecture and Data for Omni-Modal Understanding LLM**</span> <br />