ZhengGeng committed on
Commit b8c2a35 · verified · 1 Parent(s): a8980c9

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. Amodal3R/.gitattributes +35 -0
  2. Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json +31 -0
  3. Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors +3 -0
  4. Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json +17 -0
  5. Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors +3 -0
  6. Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.json +20 -0
  7. Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.safetensors +3 -0
  8. Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.json +12 -0
  9. Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.safetensors +3 -0
  10. Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.json +18 -0
  11. Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.safetensors +3 -0
  12. Amodal3R/README.md +13 -0
  13. Amodal3R/pipeline.json +59 -0
  14. FoundationPose/2023-10-28-18-33-37/config.yml +39 -0
  15. FoundationPose/2023-10-28-18-33-37/model_best.pth +3 -0
  16. FoundationPose/2024-01-11-20-02-45/config.yml +41 -0
  17. FoundationPose/2024-01-11-20-02-45/model_best.pth +3 -0
  18. SAM/.gitattributes +34 -0
  19. SAM/README.md +122 -0
  20. SAM/config.json +249 -0
  21. SAM/model.safetensors +3 -0
  22. SAM/preprocessor_config.json +28 -0
  23. SAM/pytorch_model.bin +3 -0
  24. SAM/tf_model.h5 +3 -0
  25. SAM2/sam2_hiera_large.pt +3 -0
  26. SpatialTrackerV2/tracker_offline/.gitattributes +35 -0
  27. SpatialTrackerV2/tracker_offline/README.md +10 -0
  28. SpatialTrackerV2/tracker_offline/config.json +28 -0
  29. SpatialTrackerV2/tracker_offline/model.safetensors +3 -0
  30. SpatialTrackerV2/tracker_online/.gitattributes +35 -0
  31. SpatialTrackerV2/tracker_online/README.md +10 -0
  32. SpatialTrackerV2/tracker_online/config.json +28 -0
  33. SpatialTrackerV2/tracker_online/model.safetensors +3 -0
  34. SpatialTrackerV2/vggt_front/.gitattributes +35 -0
  35. SpatialTrackerV2/vggt_front/README.md +10 -0
  36. SpatialTrackerV2/vggt_front/config.json +5 -0
  37. SpatialTrackerV2/vggt_front/model.safetensors +3 -0
  38. Stable3DGen/.gitattributes +35 -0
  39. Stable3DGen/README.md +9 -0
  40. Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json +17 -0
  41. Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors +3 -0
  42. Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.json +19 -0
  43. Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.safetensors +3 -0
  44. Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.json +12 -0
  45. Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.safetensors +3 -0
  46. Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.json +17 -0
  47. Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors +3 -0
  48. Stable3DGen/epoch=49-step=123100.ckpt +3 -0
  49. Stable3DGen/pipeline.json +58 -0
  50. SuperGlue/superglue_indoor.pth +3 -0
Amodal3R/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "name": "SLatGaussianDecoder",
+ "args": {
+ "resolution": 64,
+ "model_channels": 768,
+ "latent_channels": 8,
+ "num_blocks": 12,
+ "num_heads": 12,
+ "mlp_ratio": 4,
+ "attn_mode": "swin",
+ "window_size": 8,
+ "use_fp16": true,
+ "representation_config": {
+ "lr": {
+ "_xyz": 1.0,
+ "_features_dc": 1.0,
+ "_opacity": 1.0,
+ "_scaling": 1.0,
+ "_rotation": 0.1
+ },
+ "perturb_offset": true,
+ "voxel_size": 1.5,
+ "num_gaussians": 32,
+ "2d_filter_kernel_size": 0.1,
+ "3d_filter_kernel_size": 9e-4,
+ "scaling_bias": 4e-3,
+ "opacity_bias": 0.1,
+ "scaling_activation": "softplus"
+ }
+ }
+ }
Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38c84bcef5ce0af1f48b1b5558dabc7575a13346043c41a7e0610f1fa619a161
+ size 171450952
Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "name": "SLatMeshDecoder",
+ "args": {
+ "resolution": 64,
+ "model_channels": 768,
+ "latent_channels": 8,
+ "num_blocks": 12,
+ "num_heads": 12,
+ "mlp_ratio": 4,
+ "attn_mode": "swin",
+ "window_size": 8,
+ "use_fp16": true,
+ "representation_config": {
+ "use_color": true
+ }
+ }
+ }
Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859
+ size 181903412
Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "name": "SLatFlowModelMaskAsCondWeighted",
+ "args": {
+ "resolution": 64,
+ "in_channels": 8,
+ "out_channels": 8,
+ "model_channels": 1024,
+ "cond_channels": 1024,
+ "num_blocks": 24,
+ "num_heads": 16,
+ "mlp_ratio": 4,
+ "patch_size": 2,
+ "num_io_res_blocks": 2,
+ "io_block_channels": [128],
+ "pe_mode": "ape",
+ "qk_rms_norm": true,
+ "use_fp16": true,
+ "mask_cond_type": "mask_patcher"
+ }
+ }
Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75d218d4b147828563cd72f6dfc8fbf3489ad4a399aa4ea3ec060686924c0f3b
+ size 2804847144
Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.json ADDED
@@ -0,0 +1,12 @@
+
+ {
+ "name": "SparseStructureDecoder",
+ "args": {
+ "out_channels": 1,
+ "latent_channels": 8,
+ "num_res_blocks": 2,
+ "num_res_blocks_middle": 2,
+ "channels": [512, 128, 32],
+ "use_fp16": true
+ }
+ }
Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a
+ size 147591972
Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "name": "SparseStructureFlowModelMaskAsCondWeighted",
+ "args": {
+ "resolution": 16,
+ "in_channels": 8,
+ "out_channels": 8,
+ "model_channels": 1024,
+ "cond_channels": 1024,
+ "num_blocks": 24,
+ "num_heads": 16,
+ "mlp_ratio": 4,
+ "patch_size": 1,
+ "pe_mode": "ape",
+ "qk_rms_norm": true,
+ "use_fp16": true,
+ "mask_cond_type": "mask_patcher"
+ }
+ }
Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5d513798910c91ff0ee68f43404b9efc4887794702d45bf7b348a7bafcc9e29d
+ size 2642064600
Amodal3R/README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ license: cc-by-4.0
+ ---
+
+ # **Amodal3R**: Amodal 3D Reconstruction from Occluded 2D Images
+
+ Given partially visible objects within images, Amodal3R reconstructs semantically meaningful 3D assets with reasonable geometry and plausible appearance.
+
+ arXiv: https://arxiv.org/abs/2503.13439
+
+ Project Page: https://sm0kywu.github.io/Amodal3R/
+
+ Our model is built upon the "foundation" model [TRELLIS](https://trellis3d.github.io/). The pre-trained model weights are fetched from https://huggingface.co/JeffreyXiang/TRELLIS-image-large. Thanks for their impressive work!
Amodal3R/pipeline.json ADDED
@@ -0,0 +1,59 @@
+ {
+ "name": "Amodal3RImageTo3DPipeline",
+ "args": {
+ "models": {
+ "sparse_structure_decoder": "Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16",
+ "sparse_structure_flow_model": "Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted",
+ "slat_decoder_gs": "Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16",
+ "slat_decoder_mesh": "Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16",
+ "slat_flow_model": "Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted"
+ },
+ "sparse_structure_sampler": {
+ "name": "FlowEulerGuidanceIntervalSampler",
+ "args": {
+ "sigma_min": 1e-5
+ },
+ "params": {
+ "steps": 12,
+ "cfg_strength": 7.5,
+ "cfg_interval": [0.5, 1.0],
+ "rescale_t": 3.0
+ }
+ },
+ "slat_sampler": {
+ "name": "FlowEulerGuidanceIntervalSampler",
+ "args": {
+ "sigma_min": 1e-5
+ },
+ "params": {
+ "steps": 12,
+ "cfg_strength": 3.0,
+ "cfg_interval": [0.5, 1.0],
+ "rescale_t": 3.0
+ }
+ },
+ "slat_normalization": {
+ "mean": [
+ -2.1687545776367188,
+ -0.004347046371549368,
+ -0.13352349400520325,
+ -0.08418072760105133,
+ -0.5271206498146057,
+ 0.7238689064979553,
+ -1.1414450407028198,
+ 1.2039363384246826
+ ],
+ "std": [
+ 2.377650737762451,
+ 2.386378288269043,
+ 2.124418020248413,
+ 2.1748552322387695,
+ 2.663944721221924,
+ 2.371192216873169,
+ 2.6217446327209473,
+ 2.684523105621338
+ ]
+ },
+ "image_cond_model": "dinov2_vitl14_reg"
+ }
+ }
FoundationPose/2023-10-28-18-33-37/config.yml ADDED
@@ -0,0 +1,39 @@
+ lr: 0.0001
+ c_in: 6
+ zfar: 'Infinity'
+ debug: null
+ w_rot: 0.1
+ n_view: 1
+ run_id: null
+ use_BN: true
+ rot_rep: axis_angle
+ ckpt_dir: null
+ exp_name: 2023-10-28-18-33-37
+ save_dir: /tmp/2023-10-28-18-33-37/
+ loss_type: l2
+ optimizer: adam
+ trans_rep: tracknet
+ batch_size: 64
+ crop_ratio: 1.2
+ use_normal: false
+ BN_momentum: 0.1
+ max_num_key: null
+ warmup_step: -1
+ input_resize:
+ - 160
+ - 160
+ max_step_val: 1000
+ normal_uint8: false
+ vis_interval: 1000
+ weight_decay: 0
+ n_max_objects: null
+ normalize_xyz: true
+ clip_grad_norm: 'Infinity'
+ rot_normalizer: 0.3490658503988659
+ trans_normalizer:
+ - 0.019999999552965164
+ - 0.019999999552965164
+ - 0.05000000074505806
+ max_step_per_epoch: 25000
+ val_epoch_interval: 10
+ n_dataloader_workers: 60
FoundationPose/2023-10-28-18-33-37/model_best.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:774700586ddc435d408fc01c9809c43e151232936369dfbea0f0f964ba471d60
+ size 68220109
FoundationPose/2024-01-11-20-02-45/config.yml ADDED
@@ -0,0 +1,41 @@
+ lr: 0.0001
+ c_in: 6
+ zfar: 'Infinity'
+ debug: null
+ n_view: 1
+ run_id: 3wy8qqex
+ use_BN: true
+ exp_name: 2024-01-11-20-02-45
+ n_epochs: 62
+ save_dir: /home/bowenw/debug/2024-01-11-20-02-45/
+ use_mask: false
+ loss_type: pairwise_valid
+ optimizer: adam
+ batch_size: 64
+ crop_ratio: 1.1
+ enable_amp: true
+ use_normal: false
+ max_num_key: null
+ warmup_step: -1
+ input_resize:
+ - 160
+ - 160
+ max_step_val: 1000
+ vis_interval: 1000
+ weight_decay: 0
+ normalize_xyz: true
+ resume_run_id: null
+ clip_grad_norm: 'Infinity'
+ lr_epoch_decay: 500
+ render_backend: nvdiffrast
+ train_num_pair: 5
+ lr_decay_epochs:
+ - 50
+ n_epochs_warmup: 1
+ make_pair_online: false
+ gradient_max_norm: 'Infinity'
+ max_step_per_epoch: 10000
+ n_rendering_workers: 1
+ save_epoch_interval: 100
+ n_dataloader_workers: 100
+ split_objects_across_gpus: true
FoundationPose/2024-01-11-20-02-45/model_best.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:81924d384bf5c26c646ee4783104982ae3d1e049c181c36641b6a7aeae494c26
+ size 190229389
SAM/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SAM/README.md ADDED
@@ -0,0 +1,122 @@
+ ---
+ license: apache-2.0
+ tags:
+ - vision
+ ---
+
+ # Model Card for Segment Anything Model (SAM) - ViT Large (ViT-L) version
+
+ <p>
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-architecture.png" alt="Model architecture">
+ <em> Detailed architecture of Segment Anything Model (SAM).</em>
+ </p>
+
+
+ # Table of Contents
+
+ 0. [TL;DR](#TL;DR)
+ 1. [Model Details](#model-details)
+ 2. [Usage](#usage)
+ 3. [Citation](#citation)
+
+ # TL;DR
+
+
+ [Link to original repository](https://github.com/facebookresearch/segment-anything)
+
+ | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-beancans.png" alt="Snow" width="600" height="600"> | <img src="https://huggingface.co/facebook/sam-vit-huge/discussions/7" alt="Forest" width="600" height="600"> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car-seg.png" alt="Mountains" width="600" height="600"> |
+ |---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------|
+
+
+ The **Segment Anything Model (SAM)** produces high-quality object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image. It has been trained on a [dataset](https://segment-anything.com/dataset/index.html) of 11 million images and 1.1 billion masks, and has strong zero-shot performance on a variety of segmentation tasks.
+ The abstract of the paper states:
+
+ > We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision.
+
+ **Disclaimer**: Content from **this** model card has been written by the Hugging Face team, and parts of it were copy-pasted from the original [SAM model card](https://github.com/facebookresearch/segment-anything).
+
+ # Model Details
+
+ The SAM model is made up of the following modules:
+ - The `VisionEncoder`: a ViT-based image encoder. It computes the image embeddings using attention on patches of the image. Relative Positional Embedding is used.
+ - The `PromptEncoder`: generates embeddings for points and bounding boxes.
+ - The `MaskDecoder`: a two-way transformer which performs cross-attention between the image embedding and the point embeddings, and between the point embeddings and the image embeddings. Its outputs are fed to the `Neck`.
+ - The `Neck`: predicts the output masks based on the contextualized masks produced by the `MaskDecoder`.
+ # Usage
+
+
+ ## Prompted-Mask-Generation
+
+ ```python
+ from PIL import Image
+ import requests
+ from transformers import SamModel, SamProcessor
+
+ model = SamModel.from_pretrained("facebook/sam-vit-large").to("cuda")
+ processor = SamProcessor.from_pretrained("facebook/sam-vit-large")
+
+ img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+ input_points = [[[450, 600]]] # 2D localization of a window
+ ```
+
+
+ ```python
+ inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda")
+ outputs = model(**inputs)
+ masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+ scores = outputs.iou_scores
+ ```
+ Among other arguments to generate masks, you can pass 2D locations on the approximate position of your object of interest, a bounding box wrapping the object of interest (the format is the x, y coordinates of the top-left and bottom-right corners of the bounding box), or a segmentation mask. At the time of writing, passing text as input is not supported by the official model, according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844).
+ For more details, refer to this notebook, which shows a walkthrough of how to use the model, with a visual example!
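+
+ For example, a box prompt can be passed the same way (a minimal sketch; the coordinates below are only illustrative for the car image, and the expected format is `[x_min, y_min, x_max, y_max]`):
+
+ ```python
+ input_boxes = [[[75, 275, 1725, 850]]]  # one illustrative box wrapping the car
+ inputs = processor(raw_image, input_boxes=input_boxes, return_tensors="pt").to("cuda")
+ outputs = model(**inputs)
+ masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+ ```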
+
+ ## Automatic-Mask-Generation
+
+ The model can be used for generating segmentation masks in a "zero-shot" fashion, given an input image. The model is automatically prompted with a grid of `1024` points,
+ which are all fed to the model.
+
+ The pipeline is made for automatic mask generation. The following snippet demonstrates how easily you can run it (on any device! Simply feed the appropriate `points_per_batch` argument):
+ ```python
+ from transformers import pipeline
+ generator = pipeline("mask-generation", model="facebook/sam-vit-large", device=0, points_per_batch=256)
+ image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+ outputs = generator(image_url, points_per_batch=256)
+ ```
+ Now to display the image:
+ ```python
+ import matplotlib.pyplot as plt
+ from PIL import Image
+ import numpy as np
+
+ def show_mask(mask, ax, random_color=False):
+     if random_color:
+         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+     else:
+         color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
+     h, w = mask.shape[-2:]
+     mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+     ax.imshow(mask_image)
+
+
+ plt.imshow(np.array(raw_image))  # raw_image was loaded in the first snippet above
+ ax = plt.gca()
+ for mask in outputs["masks"]:
+     show_mask(mask, ax=ax, random_color=True)
+ plt.axis("off")
+ plt.show()
+ ```
+
+
+
+ # Citation
+
+ If you use this model, please use the following BibTeX entry.
+
+ ```
+ @article{kirillov2023segany,
+     title={Segment Anything},
+     author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. and Lo, Wan-Yen and Doll{\'a}r, Piotr and Girshick, Ross},
+     journal={arXiv:2304.02643},
+     year={2023}
+ }
+ ```
SAM/config.json ADDED
@@ -0,0 +1,249 @@
+ {
+ "_commit_hash": null,
+ "_name_or_path": "/tmp/facebook/sam-vit-large",
+ "architectures": [
+ "SamModel"
+ ],
+ "initializer_range": 0.02,
+ "mask_decoder_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_downsample_rate": 2,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_act": "relu",
+ "hidden_size": 256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "iou_head_depth": 3,
+ "iou_head_hidden_dim": 256,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_eps": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "mlp_dim": 2048,
+ "model_type": "",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 8,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": 2,
+ "num_multimask_outputs": 3,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.29.0.dev0",
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ },
+ "model_type": "sam",
+ "prompt_encoder_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_act": "gelu",
+ "hidden_size": 256,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "image_embedding_size": 64,
+ "image_size": 1024,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_eps": 1e-06,
+ "length_penalty": 1.0,
+ "mask_input_channels": 16,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_point_embeddings": 4,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_size": 16,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.29.0.dev0",
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ },
+ "torch_dtype": "float32",
+ "transformers_version": null,
+ "vision_config": {
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout": 0.0,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "global_attn_indexes": [
+ 5,
+ 11,
+ 17,
+ 23
+ ],
+ "hidden_act": "gelu",
+ "hidden_size": 1024,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "image_size": 1024,
+ "initializer_factor": 1.0,
+ "initializer_range": 1e-10,
+ "intermediate_size": 6144,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "layer_norm_eps": 1e-06,
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "mlp_dim": 4096,
+ "mlp_ratio": 4.0,
+ "model_type": "",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 16,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_channels": 3,
+ "num_hidden_layers": 24,
+ "num_pos_feats": 128,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_channels": 256,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_size": 16,
+ "prefix": null,
+ "problem_type": null,
+ "projection_dim": 512,
+ "pruned_heads": {},
+ "qkv_bias": true,
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "transformers_version": "4.29.0.dev0",
+ "typical_p": 1.0,
+ "use_abs_pos": true,
+ "use_bfloat16": false,
+ "use_rel_pos": true,
+ "window_size": 14
+ }
+ }
SAM/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a57e1b13cd1545938dfcbc9fb26df7f60de6650237a9383382a874a623564b81
+ size 1249428136
SAM/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_pad": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.485,
+ 0.456,
+ 0.406
+ ],
+ "image_processor_type": "SamImageProcessor",
+ "image_std": [
+ 0.229,
+ 0.224,
+ 0.225
+ ],
+ "pad_size": {
+ "height": 1024,
+ "width": 1024
+ },
+ "processor_class": "SamProcessor",
+ "resample": 2,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "longest_edge": 1024
+ }
+ }
SAM/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:149bba0bfe0b10f856adb815c37000978ceda04ed3a373c54e565645ae6b7c53
+ size 1249536149
SAM/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:753587fe6b74cd660635f88ea8430afd96cf0267a1f109567fd941d16e69480e
+ size 1249899608
SAM2/sam2_hiera_large.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b
+ size 897952466
SpatialTrackerV2/tracker_offline/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SpatialTrackerV2/tracker_offline/README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ tags:
+ - model_hub_mixin
+ - pytorch_model_hub_mixin
+ ---
+
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: [More Information Needed]
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
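+
+ As a minimal, hypothetical sketch of what the mixin integration provides (the actual tracker class is not named in this card), any `torch.nn.Module` that also subclasses `PyTorchModelHubMixin` gains `save_pretrained`, `from_pretrained`, and `push_to_hub`; with a recent `huggingface_hub`, its init kwargs are serialized to `config.json` and its weights to `model.safetensors`:
+
+ ```python
+ import torch.nn as nn
+ from huggingface_hub import PyTorchModelHubMixin
+
+ class ToyTracker(nn.Module, PyTorchModelHubMixin):  # hypothetical stand-in for the real tracker class
+     def __init__(self, resolution: int = 336, track_num: int = 756):
+         super().__init__()
+         self.head = nn.Linear(resolution, track_num)
+
+ ToyTracker(resolution=336, track_num=756).save_pretrained("toy_tracker")  # writes config.json + model.safetensors
+ model = ToyTracker.from_pretrained("toy_tracker")  # or a Hub repo id
+ ```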
SpatialTrackerV2/tracker_offline/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "args": {
+ "Track_cfg": {
+ "base": {
+ "corr_radius": 3,
+ "stride": 4,
+ "window_len": 60
+ },
+ "base_ckpt": "checkpoints/scaled_offline.pth",
+ "mode": "online",
+ "overlap": 4,
+ "s_wind": 200,
+ "stablizer": true
+ },
+ "backbone_cfg": {
+ "ckpt_dir": "checkpoints/model.pt"
+ },
+ "chunk_size": 24,
+ "ckpt_fwd": true,
+ "ft_cfg": {
+ "mode": "fix",
+ "paras_name": []
+ },
+ "max_len": 512,
+ "resolution": 336,
+ "track_num": 756
+ }
+ }
SpatialTrackerV2/tracker_offline/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f1236958b274867ca9a743303eb2cf48a9d217a7d005e163b45a9ab87ed2e723
+ size 275903760
SpatialTrackerV2/tracker_online/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SpatialTrackerV2/tracker_online/README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ tags:
+ - model_hub_mixin
+ - pytorch_model_hub_mixin
+ ---
+
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: [More Information Needed]
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
SpatialTrackerV2/tracker_online/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "args": {
+ "Track_cfg": {
+ "base": {
+ "corr_radius": 3,
+ "stride": 4,
+ "window_len": 20
+ },
+ "base_ckpt": "checkpoints/scaled_online.pth",
+ "mode": "online",
+ "overlap": 6,
+ "s_wind": 20,
+ "stablizer": false
+ },
+ "backbone_cfg": {
+ "ckpt_dir": "checkpoints/model.pt"
+ },
+ "chunk_size": 24,
+ "ckpt_fwd": true,
+ "ft_cfg": {
+ "mode": "fix",
+ "paras_name": []
+ },
+ "max_len": 512,
+ "resolution": 336,
+ "track_num": 756
+ }
+ }
SpatialTrackerV2/tracker_online/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31b1d74896e82d6c9330a31c84c93809f445ce1492981c2fe5d73b0eec68cf4a
+ size 264150076
SpatialTrackerV2/vggt_front/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
SpatialTrackerV2/vggt_front/README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ tags:
+ - model_hub_mixin
+ - pytorch_model_hub_mixin
+ ---
+
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: [More Information Needed]
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
SpatialTrackerV2/vggt_front/config.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "embed_dim": 1024,
+ "img_size": 518,
+ "patch_size": 14
+ }
SpatialTrackerV2/vggt_front/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28c4377fd8bedfa1f43d4e486dfdce84813b8ce3af57ecce27a93f8a5f22b788
+ size 4631919664
Stable3DGen/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Stable3DGen/README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ library_name: trellis
+ pipeline_tag: image-to-3d
+ license: mit
+ language:
+ - en
+ ---
+
+ An improved normal-conditioned version of TRELLIS.
Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "name": "SLatMeshDecoder",
+ "args": {
+ "resolution": 64,
+ "model_channels": 768,
+ "latent_channels": 8,
+ "num_blocks": 12,
+ "num_heads": 12,
+ "mlp_ratio": 4,
+ "attn_mode": "swin",
+ "window_size": 8,
+ "use_fp16": true,
+ "representation_config": {
+ "use_color": true
+ }
+ }
+ }
Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859
+ size 181903412
Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "name": "SLatFlowModel",
+ "args": {
+ "resolution": 64,
+ "in_channels": 8,
+ "out_channels": 8,
+ "model_channels": 1024,
+ "cond_channels": 1024,
+ "num_blocks": 24,
+ "num_heads": 16,
+ "mlp_ratio": 4,
+ "patch_size": 2,
+ "num_io_res_blocks": 2,
+ "io_block_channels": [128],
+ "pe_mode": "ape",
+ "qk_rms_norm": true,
+ "use_fp16": true
+ }
+ }
Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9a5896bba3b876e0560fc4a19c335d171b97502e7358b026260b76e0c4557dc
+ size 1200919136
Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.json ADDED
@@ -0,0 +1,12 @@
+
+ {
+ "name": "SparseStructureDecoder",
+ "args": {
+ "out_channels": 1,
+ "latent_channels": 8,
+ "num_res_blocks": 2,
+ "num_res_blocks_middle": 2,
+ "channels": [512, 128, 32],
+ "use_fp16": true
+ }
+ }
Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a
+ size 147591972
Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "name": "SparseStructureFlowModel",
+ "args": {
+ "resolution": 16,
+ "in_channels": 8,
+ "out_channels": 8,
+ "model_channels": 1024,
+ "cond_channels": 1024,
+ "num_blocks": 24,
+ "num_heads": 16,
+ "mlp_ratio": 4,
+ "patch_size": 1,
+ "pe_mode": "ape",
+ "qk_rms_norm": true,
+ "use_fp16": true
+ }
+ }
Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5f9f3aafead23fe5ee49ca01e57fcc0c3e0345635c05a0d83b39a7e5ccaf281
+ size 1119525912
Stable3DGen/epoch=49-step=123100.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c24b0e5f6d0adf9592848f728464c25bb6a476273617ef37a112e5d303e300f7
+ size 2662776044
Stable3DGen/pipeline.json ADDED
@@ -0,0 +1,58 @@
+ {
+ "name": "TrellisImageTo3DPipeline",
+ "args": {
+ "models": {
+ "sparse_structure_decoder": "ckpts/ss_dec_conv3d_16l8_fp16",
+ "sparse_structure_flow_model": "ckpts/ss_flow_normal_dit_L_16l8_fp16",
+ "slat_decoder_mesh": "ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16",
+ "slat_flow_model": "ckpts/slat_flow_normal_dit_L_64l8p2_fp16"
+ },
+ "sparse_structure_sampler": {
+ "name": "FlowEulerGuidanceIntervalSampler",
+ "args": {
+ "sigma_min": 1e-5
+ },
+ "params": {
+ "steps": 25,
+ "cfg_strength": 5.0,
+ "cfg_interval": [0.5, 1.0],
+ "rescale_t": 3.0
+ }
+ },
+ "slat_sampler": {
+ "name": "FlowEulerGuidanceIntervalSampler",
+ "args": {
+ "sigma_min": 1e-5
+ },
+ "params": {
+ "steps": 25,
+ "cfg_strength": 5.0,
+ "cfg_interval": [0.5, 1.0],
+ "rescale_t": 3.0
+ }
+ },
+ "slat_normalization": {
+ "mean": [
+ -2.1687545776367188,
+ -0.004347046371549368,
+ -0.13352349400520325,
+ -0.08418072760105133,
+ -0.5271206498146057,
+ 0.7238689064979553,
+ -1.1414450407028198,
+ 1.2039363384246826
+ ],
+ "std": [
+ 2.377650737762451,
+ 2.386378288269043,
+ 2.124418020248413,
+ 2.1748552322387695,
+ 2.663944721221924,
+ 2.371192216873169,
+ 2.6217446327209473,
+ 2.684523105621338
+ ]
+ },
+ "image_cond_model": "dinov2_vitl14_reg"
+ }
+ }
SuperGlue/superglue_indoor.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e710469be25ebe1e2ccf68edcae8b2945b0617c8e7e68412251d9d47f5052b1
+ size 48233807