diff --git a/Amodal3R/.gitattributes b/Amodal3R/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422 --- /dev/null +++ b/Amodal3R/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json b/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json new file mode 100644 index 
0000000000000000000000000000000000000000..051ade1e3b374738bf7a007516275f72e458f2a7 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json @@ -0,0 +1,31 @@ +{ + "name": "SLatGaussianDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "lr": { + "_xyz": 1.0, + "_features_dc": 1.0, + "_opacity": 1.0, + "_scaling": 1.0, + "_rotation": 0.1 + }, + "perturb_offset": true, + "voxel_size": 1.5, + "num_gaussians": 32, + "2d_filter_kernel_size": 0.1, + "3d_filter_kernel_size": 9e-4, + "scaling_bias": 4e-3, + "opacity_bias": 0.1, + "scaling_activation": "softplus" + } + } +} \ No newline at end of file diff --git a/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors b/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a8ca2e522bee8e81fad8bb375064925a6b70ffb --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c84bcef5ce0af1f48b1b5558dabc7575a13346043c41a7e0610f1fa619a161 +size 171450952 diff --git a/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json b/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..28802825e9ba04bcc2cfdffb491dc9dbb72c8a38 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SLatMeshDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "use_color": true + } + } +} \ No newline at end of 
file diff --git a/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors b/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..da28e99a05f40ab6f63172aaef5d695dfb1ab899 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859 +size 181903412 diff --git a/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.json b/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.json new file mode 100644 index 0000000000000000000000000000000000000000..3706080a3308b37ac004dcb5a631c80dd4456398 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.json @@ -0,0 +1,20 @@ +{ + "name": "SLatFlowModelMaskAsCondWeighted", + "args": { + "resolution": 64, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 2, + "num_io_res_blocks": 2, + "io_block_channels": [128], + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true, + "mask_cond_type": "mask_patcher" + } +} \ No newline at end of file diff --git a/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.safetensors b/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..199b0b8ba3fbb63577f17156b9c109f0f0276588 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75d218d4b147828563cd72f6dfc8fbf3489ad4a399aa4ea3ec060686924c0f3b +size 2804847144 diff --git a/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.json 
b/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3affaf13ab29fe48105229da9fab72ea8de716 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.json @@ -0,0 +1,12 @@ + +{ + "name": "SparseStructureDecoder", + "args": { + "out_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [512, 128, 32], + "use_fp16": true + } +} \ No newline at end of file diff --git a/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.safetensors b/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4b577752af35caab154c83b72427d9cc92285f6 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a +size 147591972 diff --git a/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.json b/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.json new file mode 100644 index 0000000000000000000000000000000000000000..5af82ac804815cd963d03d70e64df9ea8bc48846 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.json @@ -0,0 +1,18 @@ +{ + "name": "SparseStructureFlowModelMaskAsCondWeighted", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true, + "mask_cond_type": "mask_patcher" + } +} \ No newline at end of file diff --git a/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.safetensors b/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..0ea81edf5bf8b67dc5342f4c6355bae1da2854e5 --- /dev/null +++ b/Amodal3R/Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d513798910c91ff0ee68f43404b9efc4887794702d45bf7b348a7bafcc9e29d +size 2642064600 diff --git a/Amodal3R/README.md b/Amodal3R/README.md new file mode 100644 index 0000000000000000000000000000000000000000..447e9a692ea65fa45883497b9177923ddb17bf8d --- /dev/null +++ b/Amodal3R/README.md @@ -0,0 +1,13 @@ +--- +license: cc-by-4.0 +--- + +# **Amodal3R**: Amodal 3D Reconstruction from Occluded 2D Images + +Given partially visible objects within images, Amodal3R reconstructs semantically meaningful 3D assets with reasonable geometry and plausible appearance. + +Arxiv: https://arxiv.org/abs/2503.13439 + +Project Page: https://sm0kywu.github.io/Amodal3R/ + +Our model is built upon the "foundation" model [TRELLIS](https://trellis3d.github.io/). The pre-trained model weights are fetched from https://huggingface.co/JeffreyXiang/TRELLIS-image-large. Thanks to their impressive work!!! 
diff --git a/Amodal3R/pipeline.json b/Amodal3R/pipeline.json new file mode 100644 index 0000000000000000000000000000000000000000..9174cc14c79f17ecc5f61e2e9bd7ce1b8a7c9011 --- /dev/null +++ b/Amodal3R/pipeline.json @@ -0,0 +1,59 @@ +{ + "name": "Amodal3RImageTo3DPipeline", + "args": { + "models": { + "sparse_structure_decoder": "Amodal3R_ckpts/ss_dec_conv3d_16l8_fp16", + "sparse_structure_flow_model": "Amodal3R_ckpts/ss_flow_img_dit_L_16l8_fp16_doubleattn_weighted", + "slat_decoder_gs": "Amodal3R_ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16", + "slat_decoder_mesh": "Amodal3R_ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16", + "slat_flow_model": "Amodal3R_ckpts/slat_flow_img_dit_L_64l8p2_fp16_doubleattn_weighted" + }, + "sparse_structure_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 12, + "cfg_strength": 7.5, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + "slat_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 12, + "cfg_strength": 3.0, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + "slat_normalization": { + "mean": [ + -2.1687545776367188, + -0.004347046371549368, + -0.13352349400520325, + -0.08418072760105133, + -0.5271206498146057, + 0.7238689064979553, + -1.1414450407028198, + 1.2039363384246826 + ], + "std": [ + 2.377650737762451, + 2.386378288269043, + 2.124418020248413, + 2.1748552322387695, + 2.663944721221924, + 2.371192216873169, + 2.6217446327209473, + 2.684523105621338 + ] + }, + "image_cond_model": "dinov2_vitl14_reg" + } +} \ No newline at end of file diff --git a/FoundationPose/2023-10-28-18-33-37/config.yml b/FoundationPose/2023-10-28-18-33-37/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..d962a1d9a713fcb5f3e125e4fac7cdff3fd59c2f --- /dev/null +++ b/FoundationPose/2023-10-28-18-33-37/config.yml @@ -0,0 +1,39 @@ +lr: 0.0001 +c_in: 6 +zfar: 'Infinity' +debug: null 
+w_rot: 0.1 +n_view: 1 +run_id: null +use_BN: true +rot_rep: axis_angle +ckpt_dir: null +exp_name: 2023-10-28-18-33-37 +save_dir: /tmp/2023-10-28-18-33-37/ +loss_type: l2 +optimizer: adam +trans_rep: tracknet +batch_size: 64 +crop_ratio: 1.2 +use_normal: false +BN_momentum: 0.1 +max_num_key: null +warmup_step: -1 +input_resize: +- 160 +- 160 +max_step_val: 1000 +normal_uint8: false +vis_interval: 1000 +weight_decay: 0 +n_max_objects: null +normalize_xyz: true +clip_grad_norm: 'Infinity' +rot_normalizer: 0.3490658503988659 +trans_normalizer: +- 0.019999999552965164 +- 0.019999999552965164 +- 0.05000000074505806 +max_step_per_epoch: 25000 +val_epoch_interval: 10 +n_dataloader_workers: 60 diff --git a/FoundationPose/2023-10-28-18-33-37/model_best.pth b/FoundationPose/2023-10-28-18-33-37/model_best.pth new file mode 100644 index 0000000000000000000000000000000000000000..9cb47b3f6babe6e415ea8a9905170bb670ab8a14 --- /dev/null +++ b/FoundationPose/2023-10-28-18-33-37/model_best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:774700586ddc435d408fc01c9809c43e151232936369dfbea0f0f964ba471d60 +size 68220109 diff --git a/FoundationPose/2024-01-11-20-02-45/config.yml b/FoundationPose/2024-01-11-20-02-45/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..69cf5c058495cc5dda87fe845fe6efd6188ceb7b --- /dev/null +++ b/FoundationPose/2024-01-11-20-02-45/config.yml @@ -0,0 +1,41 @@ +lr: 0.0001 +c_in: 6 +zfar: 'Infinity' +debug: null +n_view: 1 +run_id: 3wy8qqex +use_BN: true +exp_name: 2024-01-11-20-02-45 +n_epochs: 62 +save_dir: /home/bowenw/debug/2024-01-11-20-02-45/ +use_mask: false +loss_type: pairwise_valid +optimizer: adam +batch_size: 64 +crop_ratio: 1.1 +enable_amp: true +use_normal: false +max_num_key: null +warmup_step: -1 +input_resize: +- 160 +- 160 +max_step_val: 1000 +vis_interval: 1000 +weight_decay: 0 +normalize_xyz: true +resume_run_id: null +clip_grad_norm: 'Infinity' +lr_epoch_decay: 500 +render_backend: 
nvdiffrast +train_num_pair: 5 +lr_decay_epochs: +- 50 +n_epochs_warmup: 1 +make_pair_online: false +gradient_max_norm: 'Infinity' +max_step_per_epoch: 10000 +n_rendering_workers: 1 +save_epoch_interval: 100 +n_dataloader_workers: 100 +split_objects_across_gpus: true diff --git a/FoundationPose/2024-01-11-20-02-45/model_best.pth b/FoundationPose/2024-01-11-20-02-45/model_best.pth new file mode 100644 index 0000000000000000000000000000000000000000..0322d30546694ad55de90ef320e67e30f9a64eae --- /dev/null +++ b/FoundationPose/2024-01-11-20-02-45/model_best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81924d384bf5c26c646ee4783104982ae3d1e049c181c36641b6a7aeae494c26 +size 190229389 diff --git a/SAM/.gitattributes b/SAM/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..c7d9f3332a950355d5a77d85000f05e6f45435ea --- /dev/null +++ b/SAM/.gitattributes @@ -0,0 +1,34 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text 
+*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/SAM/README.md b/SAM/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f8fa0907699c9494e50d6c12baebbb158a3efe2c --- /dev/null +++ b/SAM/README.md @@ -0,0 +1,122 @@ +--- +license: apache-2.0 +tags: +- vision +--- + +# Model Card for Segment Anything Model (SAM) - ViT Large (ViT-L) version + +

+ Model architecture + Detailed architecture of Segment Anything Model (SAM). +

+ + +# Table of Contents + +0. [TL;DR](#TL;DR) +1. [Model Details](#model-details) +2. [Usage](#usage) +3. [Citation](#citation) + +# TL;DR + + +[Link to original repository](https://github.com/facebookresearch/segment-anything) + +| Snow | Forest | Mountains | +|---------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| + + +The **Segment Anything Model (SAM)** produces high quality object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image. It has been trained on a [dataset](https://segment-anything.com/dataset/index.html) of 11 million images and 1.1 billion masks, and has strong zero-shot performance on a variety of segmentation tasks. +The abstract of the paper states: + +> We introduce the Segment Anything (SA) project: a new task, model, and dataset for image segmentation. Using our efficient model in a data collection loop, we built the largest segmentation dataset to date (by far), with over 1 billion masks on 11M licensed and privacy respecting images. The model is designed and trained to be promptable, so it can transfer zero-shot to new image distributions and tasks. We evaluate its capabilities on numerous tasks and find that its zero-shot performance is impressive -- often competitive with or even superior to prior fully supervised results. We are releasing the Segment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and 11M images at [https://segment-anything.com](https://segment-anything.com) to foster research into foundation models for computer vision. 
+ +**Disclaimer**: Content from **this** model card has been written by the Hugging Face team, and parts of it were copy-pasted from the original [SAM model card](https://github.com/facebookresearch/segment-anything). + +# Model Details + +The SAM model is made up of 4 modules: + - The `VisionEncoder`: a VIT based image encoder. It computes the image embeddings using attention on patches of the image. Relative Positional Embedding is used. + - The `PromptEncoder`: generates embeddings for points and bounding boxes + - The `MaskDecoder`: a two-way transformer which performs cross attention between the image embedding and the point embeddings (->) and between the point embeddings and the image embeddings. The outputs are fed to the `Neck`. + - The `Neck`: predicts the output masks based on the contextualized masks produced by the `MaskDecoder`. +# Usage + + +## Prompted-Mask-Generation + +```python +from PIL import Image +import requests +from transformers import SamModel, SamProcessor + +model = SamModel.from_pretrained("facebook/sam-vit-large") +processor = SamProcessor.from_pretrained("facebook/sam-vit-large") + +img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") +input_points = [[[450, 600]]] # 2D localization of a window +``` + + +```python +inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to("cuda") +outputs = model(**inputs) +masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) +scores = outputs.iou_scores +``` +Among other arguments to generate masks, you can pass 2D locations on the approximate position of your object of interest, a bounding box wrapping the object of interest (the format should be x, y coordinate of the top left and bottom right point of the bounding box), or a segmentation mask. 
At the time of writing, passing a text as input is not supported by the official model according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). +For more details, refer to this notebook, which shows a walkthrough of how to use the model, with a visual example! + +## Automatic-Mask-Generation + +The model can be used for generating segmentation masks in a "zero-shot" fashion, given an input image. The model is automatically prompted with a grid of `1024` points +which are all fed to the model. + +The pipeline is made for automatic mask generation. The following snippet demonstrates how easily you can run it (on any device! Simply feed the appropriate `points_per_batch` argument) +```python +from transformers import pipeline +generator = pipeline("mask-generation", device = 0, points_per_batch = 256) +image_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +outputs = generator(image_url, points_per_batch = 256) +``` +Now to display the image: +```python +import matplotlib.pyplot as plt +from PIL import Image +import numpy as np + +def show_mask(mask, ax, random_color=False): + if random_color: + color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) + else: + color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6]) + h, w = mask.shape[-2:] + mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) + ax.imshow(mask_image) + + +plt.imshow(np.array(raw_image)) +ax = plt.gca() +for mask in outputs["masks"]: + show_mask(mask, ax=ax, random_color=True) +plt.axis("off") +plt.show() +``` + + + +# Citation + +If you use this model, please use the following BibTeX entry. + +``` +@article{kirillov2023segany, + title={Segment Anything}, + author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C. 
and Lo, Wan-Yen and Doll{\'a}r, Piotr and Girshick, Ross}, + journal={arXiv:2304.02643}, + year={2023} +} +``` \ No newline at end of file diff --git a/SAM/config.json b/SAM/config.json new file mode 100644 index 0000000000000000000000000000000000000000..dfcc46262a8a8301a0ed49a4daf598e2c06dda42 --- /dev/null +++ b/SAM/config.json @@ -0,0 +1,249 @@ +{ + "_commit_hash": null, + "_name_or_path": "/tmp/facebook/sam-vit-large", + "architectures": [ + "SamModel" + ], + "initializer_range": 0.02, + "mask_decoder_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_downsample_rate": 2, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "relu", + "hidden_size": 256, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "iou_head_depth": 3, + "iou_head_hidden_dim": 256, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mlp_dim": 2048, + "model_type": "", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 2, + "num_multimask_outputs": 3, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + 
"suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "transformers_version": "4.29.0.dev0", + "typical_p": 1.0, + "use_bfloat16": false + }, + "model_type": "sam", + "prompt_encoder_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 256, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_embedding_size": 64, + "image_size": 1024, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "mask_input_channels": 16, + "max_length": 20, + "min_length": 0, + "model_type": "", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_point_embeddings": 4, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 16, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + 
"tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "transformers_version": "4.29.0.dev0", + "typical_p": 1.0, + "use_bfloat16": false + }, + "torch_dtype": "float32", + "transformers_version": null, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "global_attn_indexes": [ + 5, + 11, + 17, + 23 + ], + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 1024, + "initializer_factor": 1.0, + "initializer_range": 1e-10, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "mlp_dim": 4096, + "mlp_ratio": 4.0, + "model_type": "", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_pos_feats": 128, + "num_return_sequences": 1, + "output_attentions": false, + "output_channels": 256, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 16, + "prefix": null, + "problem_type": null, + "projection_dim": 512, + "pruned_heads": {}, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + 
"suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "transformers_version": "4.29.0.dev0", + "typical_p": 1.0, + "use_abs_pos": true, + "use_bfloat16": false, + "use_rel_pos": true, + "window_size": 14 + } +} diff --git a/SAM/model.safetensors b/SAM/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..37b43d93db0c686bb99df506df115d5065cae4ab --- /dev/null +++ b/SAM/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a57e1b13cd1545938dfcbc9fb26df7f60de6650237a9383382a874a623564b81 +size 1249428136 diff --git a/SAM/preprocessor_config.json b/SAM/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..732fbaf0c512b97d8d9161f51bc157bfb2873d12 --- /dev/null +++ b/SAM/preprocessor_config.json @@ -0,0 +1,28 @@ +{ + "do_convert_rgb": true, + "do_normalize": true, + "do_pad": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.485, + 0.456, + 0.406 + ], + "image_processor_type": "SamImageProcessor", + "image_std": [ + 0.229, + 0.224, + 0.225 + ], + "pad_size": { + "height": 1024, + "width": 1024 + }, + "processor_class": "SamProcessor", + "resample": 2, + "rescale_factor": 0.00392156862745098, + "size": { + "longest_edge": 1024 + } +} diff --git a/SAM/pytorch_model.bin b/SAM/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c087d49a29b9fe267f099db14b75b4653a92d381 --- /dev/null +++ b/SAM/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149bba0bfe0b10f856adb815c37000978ceda04ed3a373c54e565645ae6b7c53 +size 1249536149 diff --git a/SAM/tf_model.h5 b/SAM/tf_model.h5 new file mode 100644 index 
0000000000000000000000000000000000000000..c39c4a1808fab33bd757f4583f32665a7ab56899 --- /dev/null +++ b/SAM/tf_model.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753587fe6b74cd660635f88ea8430afd96cf0267a1f109567fd941d16e69480e +size 1249899608 diff --git a/SAM2/sam2_hiera_large.pt b/SAM2/sam2_hiera_large.pt new file mode 100644 index 0000000000000000000000000000000000000000..7198ee4779a9e91db4d79bdc80e188cc182482e0 --- /dev/null +++ b/SAM2/sam2_hiera_large.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7442e4e9b732a508f80e141e7c2913437a3610ee0c77381a66658c3a445df87b +size 897952466 diff --git a/SpatialTrackerV2/tracker_offline/.gitattributes b/SpatialTrackerV2/tracker_offline/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/SpatialTrackerV2/tracker_offline/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs 
-text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/SpatialTrackerV2/tracker_offline/README.md b/SpatialTrackerV2/tracker_offline/README.md new file mode 100644 index 0000000000000000000000000000000000000000..515962fb9f765195c14254deb0878b47c7d0ca5e --- /dev/null +++ b/SpatialTrackerV2/tracker_offline/README.md @@ -0,0 +1,10 @@ +--- +tags: +- model_hub_mixin +- pytorch_model_hub_mixin +--- + +This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration: +- Code: [More Information Needed] +- Paper: [More Information Needed] +- Docs: [More Information Needed] \ No newline at end of file diff --git a/SpatialTrackerV2/tracker_offline/config.json b/SpatialTrackerV2/tracker_offline/config.json new file mode 100644 index 0000000000000000000000000000000000000000..f1d2a7bbff70adf9bbbf65bc4f6dc528885e3404 --- /dev/null +++ b/SpatialTrackerV2/tracker_offline/config.json @@ -0,0 +1,28 @@ +{ + "args": { + "Track_cfg": { + "base": { + "corr_radius": 3, + "stride": 4, + "window_len": 60 + }, + "base_ckpt": "checkpoints/scaled_offline.pth", + "mode": "online", + "overlap": 4, + "s_wind": 200, + "stablizer": true + }, + "backbone_cfg": { + "ckpt_dir": "checkpoints/model.pt" + }, + "chunk_size": 24, + "ckpt_fwd": true, + "ft_cfg": { + "mode": "fix", + "paras_name": [] + }, + "max_len": 512, + "resolution": 336, + "track_num": 756 + } +} \ No newline at end of file diff --git 
a/SpatialTrackerV2/tracker_offline/model.safetensors b/SpatialTrackerV2/tracker_offline/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d2a7d927ce33d7ca5ccfb50e60f2c8c1be3bd77 --- /dev/null +++ b/SpatialTrackerV2/tracker_offline/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1236958b274867ca9a743303eb2cf48a9d217a7d005e163b45a9ab87ed2e723 +size 275903760 diff --git a/SpatialTrackerV2/tracker_online/.gitattributes b/SpatialTrackerV2/tracker_online/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/SpatialTrackerV2/tracker_online/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite 
filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/SpatialTrackerV2/tracker_online/README.md b/SpatialTrackerV2/tracker_online/README.md new file mode 100644 index 0000000000000000000000000000000000000000..515962fb9f765195c14254deb0878b47c7d0ca5e --- /dev/null +++ b/SpatialTrackerV2/tracker_online/README.md @@ -0,0 +1,10 @@ +--- +tags: +- model_hub_mixin +- pytorch_model_hub_mixin +--- + +This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration: +- Code: [More Information Needed] +- Paper: [More Information Needed] +- Docs: [More Information Needed] \ No newline at end of file diff --git a/SpatialTrackerV2/tracker_online/config.json b/SpatialTrackerV2/tracker_online/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e821ab00c7a2fd2dba6f96c97956613600281324 --- /dev/null +++ b/SpatialTrackerV2/tracker_online/config.json @@ -0,0 +1,28 @@ +{ + "args": { + "Track_cfg": { + "base": { + "corr_radius": 3, + "stride": 4, + "window_len": 20 + }, + "base_ckpt": "checkpoints/scaled_online.pth", + "mode": "online", + "overlap": 6, + "s_wind": 20, + "stablizer": false + }, + "backbone_cfg": { + "ckpt_dir": "checkpoints/model.pt" + }, + "chunk_size": 24, + "ckpt_fwd": true, + "ft_cfg": { + "mode": "fix", + "paras_name": [] + }, + "max_len": 512, + "resolution": 336, + "track_num": 756 + } +} \ No newline at end of file diff --git a/SpatialTrackerV2/tracker_online/model.safetensors b/SpatialTrackerV2/tracker_online/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..049a8b8a958795ba781c7daa0c0d2b8fe1bca257 --- /dev/null +++ 
b/SpatialTrackerV2/tracker_online/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b1d74896e82d6c9330a31c84c93809f445ce1492981c2fe5d73b0eec68cf4a +size 264150076 diff --git a/SpatialTrackerV2/vggt_front/.gitattributes b/SpatialTrackerV2/vggt_front/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/SpatialTrackerV2/vggt_front/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs 
-text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/SpatialTrackerV2/vggt_front/README.md b/SpatialTrackerV2/vggt_front/README.md new file mode 100644 index 0000000000000000000000000000000000000000..515962fb9f765195c14254deb0878b47c7d0ca5e --- /dev/null +++ b/SpatialTrackerV2/vggt_front/README.md @@ -0,0 +1,10 @@ +--- +tags: +- model_hub_mixin +- pytorch_model_hub_mixin +--- + +This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration: +- Code: [More Information Needed] +- Paper: [More Information Needed] +- Docs: [More Information Needed] \ No newline at end of file diff --git a/SpatialTrackerV2/vggt_front/config.json b/SpatialTrackerV2/vggt_front/config.json new file mode 100644 index 0000000000000000000000000000000000000000..303bf21400e2723e8ff9c0c7ceb6d86859b1ddeb --- /dev/null +++ b/SpatialTrackerV2/vggt_front/config.json @@ -0,0 +1,5 @@ +{ + "embed_dim": 1024, + "img_size": 518, + "patch_size": 14 +} \ No newline at end of file diff --git a/SpatialTrackerV2/vggt_front/model.safetensors b/SpatialTrackerV2/vggt_front/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dbc54b65c68e6b08b3df16857a824ae2f2535820 --- /dev/null +++ b/SpatialTrackerV2/vggt_front/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c4377fd8bedfa1f43d4e486dfdce84813b8ce3af57ecce27a93f8a5f22b788 +size 4631919664 diff --git a/Stable3DGen/.gitattributes b/Stable3DGen/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Stable3DGen/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz 
filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Stable3DGen/README.md b/Stable3DGen/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c96cc0e58f27a85e5b4f25af9d29e58ee385681 --- /dev/null +++ b/Stable3DGen/README.md @@ -0,0 +1,9 @@ +--- +library_name: trellis +pipeline_tag: image-to-3d +license: mit +language: +- en +--- + +An improved normal conditioned version of TRELLIS. 
diff --git a/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json b/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..28802825e9ba04bcc2cfdffb491dc9dbb72c8a38 --- /dev/null +++ b/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SLatMeshDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "use_color": true + } + } +} \ No newline at end of file diff --git a/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors b/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..da28e99a05f40ab6f63172aaef5d695dfb1ab899 --- /dev/null +++ b/Stable3DGen/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859 +size 181903412 diff --git a/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.json b/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..22f500ab89867eb147f04d313cffac98e54ca163 --- /dev/null +++ b/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.json @@ -0,0 +1,19 @@ +{ + "name": "SLatFlowModel", + "args": { + "resolution": 64, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 2, + "num_io_res_blocks": 2, + "io_block_channels": [128], + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.safetensors 
b/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c0df53527fdec2ddcb78907b8a231e34311f5c0 --- /dev/null +++ b/Stable3DGen/ckpts/slat_flow_normal_dit_L_64l8p2_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9a5896bba3b876e0560fc4a19c335d171b97502e7358b026260b76e0c4557dc +size 1200919136 diff --git a/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.json b/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3affaf13ab29fe48105229da9fab72ea8de716 --- /dev/null +++ b/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.json @@ -0,0 +1,12 @@ + +{ + "name": "SparseStructureDecoder", + "args": { + "out_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [512, 128, 32], + "use_fp16": true + } +} \ No newline at end of file diff --git a/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.safetensors b/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4b577752af35caab154c83b72427d9cc92285f6 --- /dev/null +++ b/Stable3DGen/ckpts/ss_dec_conv3d_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a +size 147591972 diff --git a/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.json b/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..d228f99cd828a54695ebd1f0fd1ad1b1f3179f42 --- /dev/null +++ b/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", 
+ "qk_rms_norm": true, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors b/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51ec22cd0cdd6ea961b68f2a623a1cfbf0bac5da --- /dev/null +++ b/Stable3DGen/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f9f3aafead23fe5ee49ca01e57fcc0c3e0345635c05a0d83b39a7e5ccaf281 +size 1119525912 diff --git a/Stable3DGen/epoch=49-step=123100.ckpt b/Stable3DGen/epoch=49-step=123100.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..bc42eb82a4f7af134e80dd8d7f27f9d7c5a1146e --- /dev/null +++ b/Stable3DGen/epoch=49-step=123100.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24b0e5f6d0adf9592848f728464c25bb6a476273617ef37a112e5d303e300f7 +size 2662776044 diff --git a/Stable3DGen/pipeline.json b/Stable3DGen/pipeline.json new file mode 100644 index 0000000000000000000000000000000000000000..2f4116221472c7f1c14296760b176d408813f8c2 --- /dev/null +++ b/Stable3DGen/pipeline.json @@ -0,0 +1,58 @@ +{ + "name": "TrellisImageTo3DPipeline", + "args": { + "models": { + "sparse_structure_decoder": "ckpts/ss_dec_conv3d_16l8_fp16", + "sparse_structure_flow_model": "ckpts/ss_flow_normal_dit_L_16l8_fp16", + "slat_decoder_mesh": "ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16", + "slat_flow_model": "ckpts/slat_flow_normal_dit_L_64l8p2_fp16" + }, + "sparse_structure_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 25, + "cfg_strength": 5.0, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + "slat_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 25, + "cfg_strength": 5.0, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + 
"slat_normalization": { + "mean": [ + -2.1687545776367188, + -0.004347046371549368, + -0.13352349400520325, + -0.08418072760105133, + -0.5271206498146057, + 0.7238689064979553, + -1.1414450407028198, + 1.2039363384246826 + ], + "std": [ + 2.377650737762451, + 2.386378288269043, + 2.124418020248413, + 2.1748552322387695, + 2.663944721221924, + 2.371192216873169, + 2.6217446327209473, + 2.684523105621338 + ] + }, + "image_cond_model": "dinov2_vitl14_reg" + } +} diff --git a/SuperGlue/superglue_indoor.pth b/SuperGlue/superglue_indoor.pth new file mode 100644 index 0000000000000000000000000000000000000000..969252133f802cb03256c15a3881b8b39c1867d4 --- /dev/null +++ b/SuperGlue/superglue_indoor.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e710469be25ebe1e2ccf68edcae8b2945b0617c8e7e68412251d9d47f5052b1 +size 48233807 diff --git a/SuperPoint/superpoint_v1.pth b/SuperPoint/superpoint_v1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7648726e3a3dfa2581e86bfa9c5a2a05cfb9bf74 --- /dev/null +++ b/SuperPoint/superpoint_v1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52b6708629640ca883673b5d5c097c4ddad37d8048b33f09c8ca0d69db12c40e +size 5206086 diff --git a/Trellis/.gitattributes b/Trellis/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/Trellis/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text 
+*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/Trellis/README.md b/Trellis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd5bfeb8e375a40a22ee2d9693357e7f7433ef15 --- /dev/null +++ b/Trellis/README.md @@ -0,0 +1,9 @@ +--- +library_name: Hi3dGen_Color +pipeline_tag: image-to-3d +license: mit +language: +- en +--- + +An improved normal conditioned version of TRELLIS. 
diff --git a/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json b/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..051ade1e3b374738bf7a007516275f72e458f2a7 --- /dev/null +++ b/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.json @@ -0,0 +1,31 @@ +{ + "name": "SLatGaussianDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "lr": { + "_xyz": 1.0, + "_features_dc": 1.0, + "_opacity": 1.0, + "_scaling": 1.0, + "_rotation": 0.1 + }, + "perturb_offset": true, + "voxel_size": 1.5, + "num_gaussians": 32, + "2d_filter_kernel_size": 0.1, + "3d_filter_kernel_size": 9e-4, + "scaling_bias": 4e-3, + "opacity_bias": 0.1, + "scaling_activation": "softplus" + } + } +} \ No newline at end of file diff --git a/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors b/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a8ca2e522bee8e81fad8bb375064925a6b70ffb --- /dev/null +++ b/Trellis/ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c84bcef5ce0af1f48b1b5558dabc7575a13346043c41a7e0610f1fa619a161 +size 171450952 diff --git a/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json b/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..28802825e9ba04bcc2cfdffb491dc9dbb72c8a38 --- /dev/null +++ b/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SLatMeshDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + 
"representation_config": { + "use_color": true + } + } +} \ No newline at end of file diff --git a/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors b/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..da28e99a05f40ab6f63172aaef5d695dfb1ab899 --- /dev/null +++ b/Trellis/ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e87aba94b5786407eb06d0502c1ed0885a0027a3f2b8537bfe15b0a92c01859 +size 181903412 diff --git a/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.json b/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..10676bd6322cb1519c9cb603ea77cfa2057ae45e --- /dev/null +++ b/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.json @@ -0,0 +1,18 @@ +{ + "name": "SLatRadianceFieldDecoder", + "args": { + "resolution": 64, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true, + "representation_config": { + "rank": 16, + "dim": 8 + } + } +} \ No newline at end of file diff --git a/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.safetensors b/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff6f6c33640ba58478e9d99dec5eb3ba79de7d41 --- /dev/null +++ b/Trellis/ckpts/slat_dec_rf_swin8_B_64l8r16_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737da6578d01948016b7c39786113af0d64a46f7922f6b8b5e698b84643be514 +size 171450488 diff --git a/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.json b/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..19b9b8c99c3b56eb9f07066d2a4bbad578b10291 --- /dev/null +++ b/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.json @@ -0,0 +1,15 
@@ +{ + "name": "SLatEncoder", + "args": { + "resolution": 64, + "in_channels": 1024, + "model_channels": 768, + "latent_channels": 8, + "num_blocks": 12, + "num_heads": 12, + "mlp_ratio": 4, + "attn_mode": "swin", + "window_size": 8, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.safetensors b/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..725e180c4092c7ae067a5155f8a964f6e14788fd --- /dev/null +++ b/Trellis/ckpts/slat_enc_swin8_B_64l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21dceac6bee917ab6458ff52c9757ba89a779d03031c7bd17f9e7f0103bfd436 +size 173242816 diff --git a/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.json b/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..22f500ab89867eb147f04d313cffac98e54ca163 --- /dev/null +++ b/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.json @@ -0,0 +1,19 @@ +{ + "name": "SLatFlowModel", + "args": { + "resolution": 64, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 2, + "num_io_res_blocks": 2, + "io_block_channels": [128], + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.safetensors b/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7d367bccf6fb75d9407675e3d8adc9e51307a03 --- /dev/null +++ b/Trellis/ckpts/slat_flow_img_dit_L_64l8p2_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:693fb2a58ad497bd222007301eeec49d14d60f8c12d2f2f00c221fa747b4c66c +size 1203755136 diff --git a/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.json 
b/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..9f3affaf13ab29fe48105229da9fab72ea8de716 --- /dev/null +++ b/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.json @@ -0,0 +1,12 @@ + +{ + "name": "SparseStructureDecoder", + "args": { + "out_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [512, 128, 32], + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.safetensors b/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4b577752af35caab154c83b72427d9cc92285f6 --- /dev/null +++ b/Trellis/ckpts/ss_dec_conv3d_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c76d4a40519aa2d711cc263a8404105231ac26db31d946bed48b84fee79009a +size 147591972 diff --git a/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.json b/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..af493f4615283446a71d281eb4fb5d87cc32f159 --- /dev/null +++ b/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.json @@ -0,0 +1,12 @@ + +{ + "name": "SparseStructureEncoder", + "args": { + "in_channels": 1, + "latent_channels": 8, + "num_res_blocks": 2, + "num_res_blocks_middle": 2, + "channels": [32, 128, 512], + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.safetensors b/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ebc7ddeca312299bedc83fdfa20055a265c8b00 --- /dev/null +++ b/Trellis/ckpts/ss_enc_conv3d_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107874eeaa0feb82f51b19db5da7db534fb7e7f19e5a122b9ff1bc2e258bfc6d +size 119068016 diff --git a/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.json b/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.json 
new file mode 100644 index 0000000000000000000000000000000000000000..d228f99cd828a54695ebd1f0fd1ad1b1f3179f42 --- /dev/null +++ b/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.safetensors b/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2444705e7da1d8897e5cbd08c08001033f08cf2a --- /dev/null +++ b/Trellis/ckpts/ss_flow_img_dit_L_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96dc6bfd4136fd950af564dd16b4ae533c9ba6af8f26c670646b2a9f2789b1db +size 1130770840 diff --git a/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.json b/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.json new file mode 100644 index 0000000000000000000000000000000000000000..d228f99cd828a54695ebd1f0fd1ad1b1f3179f42 --- /dev/null +++ b/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.json @@ -0,0 +1,17 @@ +{ + "name": "SparseStructureFlowModel", + "args": { + "resolution": 16, + "in_channels": 8, + "out_channels": 8, + "model_channels": 1024, + "cond_channels": 1024, + "num_blocks": 24, + "num_heads": 16, + "mlp_ratio": 4, + "patch_size": 1, + "pe_mode": "ape", + "qk_rms_norm": true, + "use_fp16": true + } +} \ No newline at end of file diff --git a/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors b/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51ec22cd0cdd6ea961b68f2a623a1cfbf0bac5da --- /dev/null +++ b/Trellis/ckpts/ss_flow_normal_dit_L_16l8_fp16.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:b5f9f3aafead23fe5ee49ca01e57fcc0c3e0345635c05a0d83b39a7e5ccaf281 +size 1119525912 diff --git a/Trellis/pipeline.json b/Trellis/pipeline.json new file mode 100644 index 0000000000000000000000000000000000000000..c260fb2dbf5c3cfb0b69ad96179158ad745ee25f --- /dev/null +++ b/Trellis/pipeline.json @@ -0,0 +1,59 @@ +{ + "name": "TrellisImageTo3DPipeline", + "args": { + "models": { + "sparse_structure_decoder": "ckpts/ss_dec_conv3d_16l8_fp16", + "sparse_structure_flow_model": "ckpts/ss_flow_normal_dit_L_16l8_fp16", + "slat_decoder_gs": "ckpts/slat_dec_gs_swin8_B_64l8gs32_fp16", + "slat_decoder_mesh": "ckpts/slat_dec_mesh_swin8_B_64l8m256c_fp16", + "slat_flow_model": "ckpts/slat_flow_img_dit_L_64l8p2_fp16" + }, + "sparse_structure_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 25, + "cfg_strength": 5.0, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + "slat_sampler": { + "name": "FlowEulerGuidanceIntervalSampler", + "args": { + "sigma_min": 1e-5 + }, + "params": { + "steps": 25, + "cfg_strength": 5.0, + "cfg_interval": [0.5, 1.0], + "rescale_t": 3.0 + } + }, + "slat_normalization": { + "mean": [ + -2.1687545776367188, + -0.004347046371549368, + -0.13352349400520325, + -0.08418072760105133, + -0.5271206498146057, + 0.7238689064979553, + -1.1414450407028198, + 1.2039363384246826 + ], + "std": [ + 2.377650737762451, + 2.386378288269043, + 2.124418020248413, + 2.1748552322387695, + 2.663944721221924, + 2.371192216873169, + 2.6217446327209473, + 2.684523105621338 + ] + }, + "image_cond_model": "dinov2_vitl14_reg" + } +} \ No newline at end of file