Instructions to use xiaomoguhzz/VisionEncoder with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use xiaomoguhzz/VisionEncoder with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("xiaomoguhzz/VisionEncoder", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
add V10.x cache qwen3vit/video_10pct_64f
Browse files
data/vmllm_cached/qwen3vit/video_10pct_64f/train/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58d83560944aea0ee7eca6f3bc12505fdd54694d356847e706a5e33d6658b11a
|
| 3 |
+
size 157175192
|
data/vmllm_cached/qwen3vit/video_10pct_64f/train/dataset_info.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 153484834,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"/share/m2v_intern_v3/wangjunjie09/model_cache/VisionEncoder/ms-swift-data/video_sft_small_10pct_sharegpt.json": {
|
| 10 |
+
"num_bytes": 173305720,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 173305720,
|
| 15 |
+
"features": {
|
| 16 |
+
"messages": [
|
| 17 |
+
{
|
| 18 |
+
"content": {
|
| 19 |
+
"dtype": "string",
|
| 20 |
+
"_type": "Value"
|
| 21 |
+
},
|
| 22 |
+
"role": {
|
| 23 |
+
"dtype": "string",
|
| 24 |
+
"_type": "Value"
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"videos": {
|
| 29 |
+
"feature": {
|
| 30 |
+
"dtype": "string",
|
| 31 |
+
"_type": "Value"
|
| 32 |
+
},
|
| 33 |
+
"_type": "Sequence"
|
| 34 |
+
},
|
| 35 |
+
"lengths": {
|
| 36 |
+
"feature": {
|
| 37 |
+
"dtype": "int64",
|
| 38 |
+
"_type": "Value"
|
| 39 |
+
},
|
| 40 |
+
"_type": "Sequence"
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"homepage": "",
|
| 44 |
+
"license": "",
|
| 45 |
+
"size_in_bytes": 326790554,
|
| 46 |
+
"splits": {
|
| 47 |
+
"train": {
|
| 48 |
+
"name": "train",
|
| 49 |
+
"num_bytes": 153484834,
|
| 50 |
+
"num_examples": 113615,
|
| 51 |
+
"dataset_name": "json"
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"version": {
|
| 55 |
+
"version_str": "0.0.0",
|
| 56 |
+
"major": 0,
|
| 57 |
+
"minor": 0,
|
| 58 |
+
"patch": 0
|
| 59 |
+
}
|
| 60 |
+
}
|
data/vmllm_cached/qwen3vit/video_10pct_64f/train/frame_patch_meta.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"patch": "frame_count_rebase",
|
| 3 |
+
"from_frames": 16,
|
| 4 |
+
"to_frames": 64,
|
| 5 |
+
"delta_per_video": 6912,
|
| 6 |
+
"tokens_per_frame": 144,
|
| 7 |
+
"input": "/share/m2v_intern_v3/wangjunjie09/VisionEncoder/data/vmllm_cached/qwen3vit/video_10pct/train",
|
| 8 |
+
"rows": 113615,
|
| 9 |
+
"lengths_sample_mean_old": 2568.1,
|
| 10 |
+
"lengths_sample_mean_new": 9480.1,
|
| 11 |
+
"timestamp": "2026-06-05T15:34:26"
|
| 12 |
+
}
|
data/vmllm_cached/qwen3vit/video_10pct_64f/train/state.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_data_files": [{"filename": "data-00000-of-00001.arrow"}], "_fingerprint": "72873a6586a1da8a", "_format_columns": null, "_format_kwargs": {}, "_format_type": null, "_output_all_columns": false, "_split": null}
|