Enhance model card: Add metadata, GitHub link, and tags
#1
by
nielsr
HF Staff
- opened
README.md
CHANGED
|
@@ -1,12 +1,21 @@
|
|
| 1 |
---
|
| 2 |
license: cc-by-sa-4.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
| 4 |
# ViMUL: A Culturally-diverse Multilingual Multimodal Video Model
|
| 5 |
|
| 6 |
[Model](https://huggingface.co/MBZUAI/ViMUL)
|
| 7 |
[Paper](https://huggingface.co/papers/2506.07032)
|
| 8 |
[Project Page](https://mbzuai-oryx.github.io/ViMUL/)
|
| 9 |
[Dataset](https://huggingface.co/datasets/MBZUAI/ViMUL-Bench)
|
|
|
|
| 10 |
|
| 11 |
## Overview
|
| 12 |
ViMUL is a multilingual video Large Multimodal Model (LMM) designed to provide better tradeoffs between high and low-resource languages for video understanding. The model is trained on a machine-translated multilingual video training set comprising 1.2 million samples and demonstrates improved performance across culturally diverse video content in multiple languages.
|
|
@@ -75,7 +84,8 @@ def infer(
|
|
| 75 |
video = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().to(device)
|
| 76 |
video = [video]
|
| 77 |
|
| 78 |
-
qs = DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
|
|
|
| 79 |
conv = conv_templates[conv_mode].copy() if conv_mode else conv_templates["default"].copy()
|
| 80 |
conv.append_message(conv.roles[0], qs)
|
| 81 |
conv.append_message(conv.roles[1], None)
|
|
@@ -115,7 +125,8 @@ if __name__ == "__main__":
|
|
| 115 |
prompt = "Describe what happens in the video."
|
| 116 |
conv_mode = "qwen_1_5"
|
| 117 |
output = infer(model_path, video_path, prompt, conv_mode=conv_mode)
|
| 118 |
-
print("\n")
|
|
|
|
| 119 |
print("="*40)
|
| 120 |
print("Output:", output)
|
| 121 |
print("="*40)
|
|
@@ -132,4 +143,4 @@ if __name__ == "__main__":
|
|
| 132 |
primaryClass={cs.CL},
|
| 133 |
url={https://arxiv.org/abs/2506.07032},
|
| 134 |
}
|
| 135 |
-
```
|
|
|
|
| 1 |
---
|
| 2 |
license: cc-by-sa-4.0
|
| 3 |
+
pipeline_tag: video-text-to-text
|
| 4 |
+
library_name: transformers
|
| 5 |
+
tags:
|
| 6 |
+
- llava
|
| 7 |
+
- qwen
|
| 8 |
+
- multilingual
|
| 9 |
+
- video-understanding
|
| 10 |
---
|
| 11 |
+
|
| 12 |
# ViMUL: A Culturally-diverse Multilingual Multimodal Video Model
|
| 13 |
|
| 14 |
[Model](https://huggingface.co/MBZUAI/ViMUL)
|
| 15 |
[Paper](https://huggingface.co/papers/2506.07032)
|
| 16 |
[Project Page](https://mbzuai-oryx.github.io/ViMUL/)
|
| 17 |
[Dataset](https://huggingface.co/datasets/MBZUAI/ViMUL-Bench)
|
| 18 |
+
[Code (GitHub)](https://github.com/mbzuai-oryx/ViMUL)
|
| 19 |
|
| 20 |
## Overview
|
| 21 |
ViMUL is a multilingual video Large Multimodal Model (LMM) designed to provide better tradeoffs between high and low-resource languages for video understanding. The model is trained on a machine-translated multilingual video training set comprising 1.2 million samples and demonstrates improved performance across culturally diverse video content in multiple languages.
|
|
|
|
| 84 |
video = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().to(device)
|
| 85 |
video = [video]
|
| 86 |
|
| 87 |
+
qs = DEFAULT_IMAGE_TOKEN + "\n" + prompt
|
| 89 |
conv = conv_templates[conv_mode].copy() if conv_mode else conv_templates["default"].copy()
|
| 90 |
conv.append_message(conv.roles[0], qs)
|
| 91 |
conv.append_message(conv.roles[1], None)
|
|
|
|
| 125 |
prompt = "Describe what happens in the video."
|
| 126 |
conv_mode = "qwen_1_5"
|
| 127 |
output = infer(model_path, video_path, prompt, conv_mode=conv_mode)
|
| 128 |
+
print("\n")
|
| 130 |
print("="*40)
|
| 131 |
print("Output:", output)
|
| 132 |
print("="*40)
|
|
|
|
| 143 |
primaryClass={cs.CL},
|
| 144 |
url={https://arxiv.org/abs/2506.07032},
|
| 145 |
}
|
| 146 |
+
```
|