update
Browse files- README.md +76 -0
- config.json +1 -1
README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- multimodal
|
| 5 |
+
- vision-language
|
| 6 |
+
- video understanding
|
| 7 |
+
- visuospatial cognition
|
| 8 |
+
- spatial reasoning
|
| 9 |
+
- vlm
|
| 10 |
+
- llava
|
| 11 |
+
- qwen
|
| 12 |
+
- siglip
|
| 13 |
+
- hiera
|
| 14 |
+
- sam2
|
| 15 |
+
- dual-encoder
|
| 16 |
+
datasets:
|
| 17 |
+
- liuhaotian/LLaVA-CC3M-Pretrain-595K
|
| 18 |
+
- lmms-lab/LLaVA-OneVision-Data
|
| 19 |
+
- nkkbr/ViCA-322K
|
| 20 |
+
- nkkbr/ViCA-thinking-2.68k
|
| 21 |
+
language:
|
| 22 |
+
- en
|
| 23 |
+
library_name: transformers
|
| 24 |
+
pipeline_tag: video-text-to-text
|
| 25 |
+
model_name: ViCA2-7B
|
| 26 |
+
model_description: |
|
| 27 |
+
ViCA2 (Visuospatial Cognitive Assistant 2) is a state-of-the-art large multimodal model tailored for fine-grained visuospatial reasoning in indoor video and image environments.
|
| 28 |
+
It builds upon the LLaVA-OneVision framework and introduces a novel dual vision encoder architecture that integrates:
|
| 29 |
+
- **SigLIP** for high-level semantic abstraction, and
|
| 30 |
+
- **Hiera** (from SAM2) for detailed spatial structure modeling.
|
| 31 |
+
|
| 32 |
+
This dual-stream design enables robust performance in tasks involving object layouts, relative positioning, temporal order, and geometric reasoning.
|
| 33 |
+
Trained with a multi-stage strategy on over **322K video-based QA pairs**, ViCA2 significantly surpasses LLaVA-NeXT-Video and Gemini-1.5 Pro on VSI-Bench.
|
| 34 |
+
|
| 35 |
+
ViCA2 is built with modularity and efficiency in mind, leveraging:
|
| 36 |
+
- Token ratio control for balancing semantic and spatial token contributions
|
| 37 |
+
- Hiera stage-specific sampling and projection
|
| 38 |
+
- Multi-stage DeepSpeed fine-tuning with frozen vision backbones
|
| 39 |
+
model-index:
|
| 40 |
+
- name: ViCA2-7B
|
| 41 |
+
results:
|
| 42 |
+
- task:
|
| 43 |
+
type: visual-question-answering
|
| 44 |
+
dataset:
|
| 45 |
+
name: VSI-Bench
|
| 46 |
+
type: vsi-bench
|
| 47 |
+
metrics:
|
| 48 |
+
- type: score
|
| 49 |
+
value: 56.81
|
| 50 |
+
name: Average
|
| 51 |
+
verified: false
|
| 52 |
+
- type: MRA
|
| 53 |
+
value: 65.73
|
| 54 |
+
name: Object Count
|
| 55 |
+
- type: MRA
|
| 56 |
+
value: 50.98
|
| 57 |
+
name: Absolute Distance
|
| 58 |
+
- type: MRA
|
| 59 |
+
value: 75.54
|
| 60 |
+
name: Object Size
|
| 61 |
+
- type: MRA
|
| 62 |
+
value: 71.42
|
| 63 |
+
name: Room Size
|
| 64 |
+
- type: accuracy
|
| 65 |
+
value: 51.55
|
| 66 |
+
name: Relative Distance
|
| 67 |
+
- type: accuracy
|
| 68 |
+
value: 34.61
|
| 69 |
+
name: Relative Direction
|
| 70 |
+
- type: accuracy
|
| 71 |
+
value: 38.14
|
| 72 |
+
name: Route Plan
|
| 73 |
+
- type: accuracy
|
| 74 |
+
value: 66.50
|
| 75 |
+
name: Appearance Order
|
| 76 |
+
---
|
config.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
{
|
| 2 |
-
"_name_or_path": "/
|
| 3 |
"add_faster_video": false,
|
| 4 |
"add_time_instruction": true,
|
| 5 |
"architectures": [
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "nkkbr/ViCA2",
|
| 3 |
"add_faster_video": false,
|
| 4 |
"add_time_instruction": true,
|
| 5 |
"architectures": [
|