Instructions to use xiaomoguhzz/VisionEncoder with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use xiaomoguhzz/VisionEncoder with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("xiaomoguhzz/VisionEncoder", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
add V10.x cache vjepa21/image_10pct
Browse files
data/vmllm_cached/vjepa21/image_10pct/train/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dde6c7cd2ce6c46b02dec2761a3aad5040dc692cce2c7f97f505767f4c53a880
|
| 3 |
+
size 69352320
|
data/vmllm_cached/vjepa21/image_10pct/train/dataset_info.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "json",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "json",
|
| 6 |
+
"dataset_size": 66130519,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {
|
| 9 |
+
"/share/m2v_intern_v3/wangjunjie09/model_cache/VisionEncoder/ms-swift-data/image_sft_small_10pct_sharegpt.json": {
|
| 10 |
+
"num_bytes": 85235535,
|
| 11 |
+
"checksum": null
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"download_size": 85235535,
|
| 15 |
+
"features": {
|
| 16 |
+
"messages": [
|
| 17 |
+
{
|
| 18 |
+
"content": {
|
| 19 |
+
"dtype": "string",
|
| 20 |
+
"_type": "Value"
|
| 21 |
+
},
|
| 22 |
+
"role": {
|
| 23 |
+
"dtype": "string",
|
| 24 |
+
"_type": "Value"
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"images": [
|
| 29 |
+
{
|
| 30 |
+
"bytes": {
|
| 31 |
+
"dtype": "null",
|
| 32 |
+
"_type": "Value"
|
| 33 |
+
},
|
| 34 |
+
"path": {
|
| 35 |
+
"dtype": "string",
|
| 36 |
+
"_type": "Value"
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
],
|
| 40 |
+
"lengths": {
|
| 41 |
+
"feature": {
|
| 42 |
+
"dtype": "int64",
|
| 43 |
+
"_type": "Value"
|
| 44 |
+
},
|
| 45 |
+
"_type": "Sequence"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"homepage": "",
|
| 49 |
+
"license": "",
|
| 50 |
+
"size_in_bytes": 151366054,
|
| 51 |
+
"splits": {
|
| 52 |
+
"train": {
|
| 53 |
+
"name": "train",
|
| 54 |
+
"num_bytes": 66130519,
|
| 55 |
+
"num_examples": 73859,
|
| 56 |
+
"dataset_name": "json"
|
| 57 |
+
}
|
| 58 |
+
},
|
| 59 |
+
"version": {
|
| 60 |
+
"version_str": "0.0.0",
|
| 61 |
+
"major": 0,
|
| 62 |
+
"minor": 0,
|
| 63 |
+
"patch": 0
|
| 64 |
+
}
|
| 65 |
+
}
|
data/vmllm_cached/vjepa21/image_10pct/train/state.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_data_files": [{"filename": "data-00000-of-00001.arrow"}], "_fingerprint": "8b508c67f2fe56fe", "_format_columns": null, "_format_kwargs": {}, "_format_type": null, "_output_all_columns": false, "_split": null}
|
data/vmllm_cached/vjepa21/image_10pct/train/vjepa21_patch_meta.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"patcher": "siglip2_to_vjepa21",
|
| 3 |
+
"mode": "image",
|
| 4 |
+
"from": "siglip2 (SigLIP2 base, num_image_tokens=729, video pool 14\u00b2)",
|
| 5 |
+
"to": "vjepa21 (V-JEPA 2.1 ViT-L, num_image_tokens=576, video tubelet skip-pool)",
|
| 6 |
+
"delta_formula": "image: lengths[0] += sum(per-image stock _get_number_of_features delta) (num_image_tokens 729 \u2192 576, via LlavaOnevisionProcessor method rebind to stub; captures unpadded + newline + base \u4e09\u9879 \u03b4)",
|
| 7 |
+
"input": "/share/m2v_intern_v3/wangjunjie09/VisionEncoder/data/vmllm_cached/siglip2/image_10pct/train",
|
| 8 |
+
"rows": 73859,
|
| 9 |
+
"lengths_sample_mean_old": 3245.27,
|
| 10 |
+
"lengths_sample_mean_new": 2624.35,
|
| 11 |
+
"timestamp": "2026-05-11T16:18:06"
|
| 12 |
+
}
|