XuyaoWang committed on Apr 19

Commit

26de0fc

verified ·

1 Parent(s): 06a23d1

Upload folder using huggingface_hub

Browse files

Files changed (29) hide show

README.md +63 -0
action_processor.json +10 -0
action_processor/config.json +8 -0
config.json +9 -0
crossattn_adapter/config.json +5 -0
crossattn_adapter/model.safetensors +3 -0
lam/config.json +16 -0
lam/model.safetensors +3 -0
scheduler/scheduler_config.json +18 -0
text_encoder/.gitattributes +35 -0
text_encoder/README.md +377 -0
text_encoder/chat_template.json +3 -0
text_encoder/config.json +61 -0
text_encoder/generation_config.json +12 -0
text_encoder/model-00001-of-00004.safetensors +3 -0
text_encoder/model-00002-of-00004.safetensors +3 -0
text_encoder/model-00003-of-00004.safetensors +3 -0
text_encoder/model-00004-of-00004.safetensors +3 -0
text_encoder/model.safetensors.index.json +736 -0
text_encoder/preprocessor_config.json +19 -0
text_encoder/tokenizer.json +0 -0
text_encoder/tokenizer_config.json +207 -0
transformer/config.json +37 -0
transformer/model-00001-of-00003.safetensors +3 -0
transformer/model-00002-of-00003.safetensors +3 -0
transformer/model-00003-of-00003.safetensors +3 -0
transformer/model.safetensors.index.json +582 -0
vae/config.json +56 -0
vae/diffusion_pytorch_model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,63 @@

+---
+license: other
+license_name: nvidia-open-model-license
+license_link: https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
+tags:
+  - robotics
+  - video-generation
+  - diffusion
+  - action-conditioned
+  - dreamdojo
+  - cosmos-predict2.5
+library_name: diffusers
+pipeline_tag: video-to-video
+---
+# DreamDojo-AgiBot-2B-Diffusers
+Fine-tuned on AgiBot robot data. Part of the [DreamDojo](https://github.com/NVIDIA/DreamDojo) model family.
+| | |
+|---|---|
+| **Size** | 2B |
+| **Stage** | Post-training |
+| **Architecture** | DiT (Diffusion Transformer) with AdaLN-LoRA |
+| **Base** | Cosmos Predict 2.5 |
+## Checkpoint Structure
+```
+DreamDojo-AgiBot-2B-Diffusers/
+├── transformer/            # DiT backbone (sharded safetensors)
+├── crossattn_adapter/      # Text-to-DiT projection (100352 → 1024)
+├── vae/                    # AutoencoderKLWan (standard diffusers)
+├── lam/                    # Latent Action Model (710M params)
+├── text_encoder/           # Cosmos-Reason1-7B
+├── scheduler/              # FlowMatchEulerDiscreteScheduler
+├── action_processor/       # DreamDojo-specific config
+└── config.json
+```
+## Architecture
+|  | 2B |
+|--|------|
+| Model channels | 2048 |
+| Transformer blocks | 28 |
+| Attention heads | 16 |
+| Patch size (spatial / temporal) | 2 / 1 |
+| Action dim | 384 (unified) |
+## Citation
+```bibtex
+@article{dreamdojo2025,
+  title={DreamDojo: Advancing Real-World Robot Policies Through Generated Interactive Environments},
+  author={NVIDIA},
+  year={2025}
+}
+```
+## License
+Please refer to the [NVIDIA DreamDojo](https://github.com/NVIDIA/DreamDojo) repository for license terms.

action_processor.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_class_name": "DreamDojoActionProcessorConfig",
+  "_diffusers_version": "0.36.0",
+  "cfg_text_dropout": 0.2,
+  "cfg_video_dropout": 0.2,
+  "num_train_timesteps": 1000,
+  "seed": 42,
+  "shift": 5.0,
+  "train_time_distribution": "logitnormal"
+}

action_processor/config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_class_name": "DreamDojoActionProcessorConfig",
+  "_diffusers_version": "0.36.0",
+  "cfg_text_dropout": 0.2,
+  "cfg_video_dropout": 0.2,
+  "seed": 42,
+  "train_time_distribution": "logitnormal"
+}

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_class_name": "DreamDojoPipeline",
+  "model_name": "DreamDojo-AgiBot-2B-Diffusers",
+  "crossattn_adapter": "crossattn_adapter",
+  "vae": "vae",
+  "lam": "lam",
+  "text_encoder": "text_encoder",
+  "transformer": "transformer"
+}

crossattn_adapter/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "model_type": "dreamdojo_crossattn_adapter",
+  "in_channels": 100352,
+  "out_channels": 1024
+}

crossattn_adapter/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:275389d9e49b3a31d08ee6fc9f657cf69b357df4c75ad8edea00a11110c3a4dd
+size 205523136

lam/config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "architectures": [
+    "DreamDojoLAM"
+  ],
+  "dec_blocks": 24,
+  "dropout": 0.0,
+  "dtype": "bfloat16",
+  "enc_blocks": 24,
+  "in_dim": 3,
+  "latent_dim": 32,
+  "model_dim": 1024,
+  "model_type": "dreamdojo_lam",
+  "num_heads": 16,
+  "patch_size": 16,
+  "transformers_version": "4.57.3"
+}

lam/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9a861d5ec2ded0283ea6f59fdf2fa8545b7e205ceb539f9188243dd8b16bd6
+size 1419658488

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_class_name": "FlowMatchEulerDiscreteScheduler",
+  "_diffusers_version": "0.36.0",
+  "base_image_seq_len": 256,
+  "base_shift": 0.5,
+  "invert_sigmas": false,
+  "max_image_seq_len": 4096,
+  "max_shift": 1.15,
+  "num_train_timesteps": 1000,
+  "shift": 5.0,
+  "shift_terminal": null,
+  "stochastic_sampling": false,
+  "time_shift_type": "exponential",
+  "use_beta_sigmas": false,
+  "use_dynamic_shifting": false,
+  "use_exponential_sigmas": false,
+  "use_karras_sigmas": false
+}

text_encoder/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

text_encoder/README.md ADDED Viewed

	@@ -0,0 +1,377 @@

+---
+license: other
+license_name: nvidia-open-model-license
+license_link: >-
+  https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license
+datasets:
+- nvidia/Cosmos-Reason1-SFT-Dataset
+- nvidia/Cosmos-Reason1-RL-Dataset
+- nvidia/Cosmos-Reason1-Benchmark
+library_name: transformers
+language:
+- en
+base_model:
+- Qwen/Qwen2.5-VL-7B-Instruct
+tags:
+- nvidia
+- cosmos
+pipeline_tag: image-text-to-text
+---
+# **Cosmos-Reason1: Physical AI Common Sense and Embodied Reasoning Models**
+[**Cosmos**](https://huggingface.co/collections/nvidia/cosmos-reason1-67c9e926206426008f1da1b7) | [**Code**](https://github.com/nvidia-cosmos/cosmos-reason1) | [**Paper**](https://arxiv.org/abs/2503.15558) | [**Paper Website**](https://research.nvidia.com/labs/dir/cosmos-reason1)
+# Model Overview
+## Description:
+NVIDIA Cosmos Reason – an open, customizable, 7B-parameter reasoning vision language model (VLM) for physical AI and robotics – enables robots and vision AI agents to reason like humans, using prior knowledge, physics understanding and common sense to understand and act in the real world. This model understands space, time, and fundamental physics, and can serve as a planning model to reason what steps an embodied agent might take next.
+Cosmos Reason excels at navigating the long tail of diverse scenarios of the physical world with spatial-temporal understanding. Cosmos Reason is post-trained with physical common sense and embodied reasoning data with supervised fine-tuning and reinforcement learning. It uses chain-of-thought reasoning capabilities to understand world dynamics without human annotations.
+Given a video/image and a text prompt, the model first converts the video/image into tokens using a vision encoder and a special translator called a projector. These video tokens are combined with the text prompt and fed into the core model, which uses a mix of LLM modules and techniques. This enables the model to think step-by-step and provide detailed, logical responses.
+Cosmos Reason can be used for robotics and physical AI applications including:
+- Data curation and annotation — Enable developers to automate high-quality curation and annotation of massive, diverse training datasets.
+- Robot planning and reasoning — Act as the brain for deliberate, methodical decision-making in a robot vision language action (VLA) model. Now robots such as humanoids and autonomous vehicles can interpret environments and, given complex commands, break them down into tasks and execute them using common sense, even in unfamiliar environments.
+- Video analytics AI agents — Extract valuable insights and perform root-cause analysis on massive volumes of video data. These agents can be used to analyze and understand recorded or live video streams across city and industrial operations.
+The model is ready for commercial use.
+**Model Developer**: NVIDIA
+## Model Versions
+The Cosmos-Reason1 family includes the following model:
+- [Cosmos-Reason1-7B](https://huggingface.co/nvidia/Cosmos-Reason1-7B): Given a text prompt and an input video, think and generate the answer with respect to the input text prompt and video.
+### License:
+This model is released under the  [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). Additional Information: [Apache License 2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).
+For a custom license, please contact [cosmos-license@nvidia.com](mailto:cosmos-license@nvidia.com).
+Under the NVIDIA Open Model License, NVIDIA confirms:
+* Models are commercially usable.
+* You are free to create and distribute Derivative Models.
+* NVIDIA does not claim ownership to any outputs generated using the Models or Derivative Models.
+**Important Note**: If You bypass, disable, reduce the efficacy of, or circumvent any technical limitation, safety guardrail or associated safety guardrail hyperparameter, encryption, security, digital rights management, or authentication mechanism (collectively “Guardrail”) contained in the Model without a substantially similar Guardrail appropriate for your use case, your rights under this Agreement [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license) will automatically terminate.
+### Deployment Geography:
+Global
+### Use Case:
+Physical AI: Space, time, fundamental physics understanding and embodied reasoning, encompassing robotics, and autonomous vehicles (AV).
+### Release Date:
+* Github: [05/17/2025](https://github.com/nvidia-cosmos/cosmos-reason1)
+* Huggingface:
+  * [08/01/2025](https://huggingface.co/nvidia/Cosmos-Reason1-7B/commit/0caf724f837efea5e25bf6d5818dcdeec0a36604). Shipped a few improvements, including captions with temporal timestamps and Set-of-Mark prompting.
+  * [06/10/2025](https://huggingface.co/nvidia/Cosmos-Reason1-7B/commit/2464fff43c5c0bfb1916ac8c009feda4aed81be9). Enhanced critic capability for physical plausibility.
+  * [05/17/2025](https://huggingface.co/nvidia/Cosmos-Reason1-7B/commit/098a5bb62a1f4fc05e5c4ac89aae8005e301aa18). Initial release.
+## Model Architecture:
+Architecture Type: A multi-modal LLM consisting of a Vision Transformer (ViT) as the vision encoder and a dense Transformer model as the LLM.
+Network Architecture: Qwen2.5-VL-7B-Instruct.
+Cosmos-Reason1-7B is post-trained based on [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) and follows the same model architecture.
+**Number of model parameters:**
+Cosmos-Reason1-7B:<br>
+  * Vision Transformer (ViT): 675.76M (675,759,104)
+  * Language Model (LLM): 7.07B (7,070,619,136)
+  * Other components (output projection layer): 545.00M (544,997,376)
+## Input
+  **Input Type(s)**: Text+Video/Image
+   **Input Format(s)**:
+   * Text: String
+   * Video: mp4
+   * Image: jpg
+  **Input Parameters**:
+  * Text: One-dimensional (1D)
+  * Video: Three-dimensional (3D)
+  * Image: Two-dimensional (2D)
+  **Other Properties Related to Input**:
+  * Use `FPS=4` for input video to match the training setup.
+  * Append `Answer the question in the following format: <think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>.` in the system prompt to encourage long chain-of-thought reasoning response.
+## Output
+ **Output Type(s)**: Text
+ **Output Format**: String
+ **Output Parameters**: Text: One-dimensional (1D)
+ **Other Properties Related to Output**:
+ * We recommend allowing 4096 or more max output tokens to avoid truncating long chain-of-thought responses.
+ * Our AI model recognizes timestamps added at the bottom of each frame for accurate temporal localization.
+ * Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions. <br>
+## Software Integration
+**Runtime Engine(s):**
+* [vLLM](https://github.com/vllm-project/vllm)
+**Supported Hardware Microarchitecture Compatibility:**
+* NVIDIA Blackwell
+* NVIDIA Hopper
+**Note**: We have only tested doing inference with BF16 precision.
+**Operating System(s):**
+* Linux (We have not tested on other operating systems.)
+# Usage
+See [Cosmos-Reason1](https://github.com/nvidia-cosmos/cosmos-reason1) for details.
+* Post Training: [Cosmos-Reason1](https://github.com/nvidia-cosmos/cosmos-reason1) provides examples of supervised fine-tuning and reinforcement learning on embodied reasoning datasets.
+## Training and Evaluation Sections:
+### 05/17/2025
+Please see our [technical paper](https://arxiv.org/pdf/2503.15558) for detailed evaluations on physical common sense and embodied reasoning. Part of the evaluation datasets are released under [Cosmos-Reason1-Benchmark](https://huggingface.co/datasets/nvidia/Cosmos-Reason1-Benchmark). The embodied reasoning datasets and benchmarks focus on the following areas: robotics (RoboVQA, BridgeDataV2, Agibot, RoboFail), ego-centric human demonstration (HoloAssist), and Autonomous Vehicle (AV) driving video data. The AV dataset is collected and annotated by NVIDIA.
+All datasets go through the data annotation process described in the technical paper to prepare training and evaluation data and annotations.
+### 08/01/2025
+We enhance the model capability with augmented training data. PLM-Video-Human and Nexar are used to enable dense temporal captioning. Describe Anything is added to enhance Set-of-Mark (SoM) prompting. We enrich data in intelligent transportation systems (ITS) and warehouse applications. Lastly, the Visual Critics dataset contains a collection of AI-generated videos from Cosmos-Predict2 and Wan2.1 with human annotations that describe the physical correctness of the AI videos.
+## Training Datasets:
+**Data Collection Method**:
+* RoboVQA:  Hybrid:  Automatic/Sensors
+* BridgeDataV2: Automatic/Sensors
+* AgiBot: Automatic/Sensors
+* RoboFail: Automatic/Sensors
+* HoloAssist: Human
+* AV: Automatic/Sensors
+* PLM-Video-Human: Human
+* Nexar: Automatic/Sensors
+* Describe Anything: Human
+* ITS / Warehouse: Human, Automatic
+* Visual Critics: Automatic
+**Labeling Method**:
+* RoboVQA:  Hybrid:  Human,Automated
+* BridgeDataV2: Hybrid:  Human,Automated
+* AgiBot: Hybrid:  Human,Automated
+* RoboFail: Hybrid:  Human,Automated
+* HoloAssist: Hybrid:  Human,Automated
+* AV: Hybrid:  Human,Automated
+* PLM-Video-Human: Human,Automated
+* Nexar: Human
+* Describe Anything: Human,Automated
+* ITS / Warehouse: Human, Automated
+* Visual Critics: Human,Automated
+# Evaluation Datasets:
+**Data Collection Method**:
+* RoboVQA:  Hybrid:  Automatic/Sensors
+* BridgeDataV2: Automatic/Sensors
+* AgiBot: Automatic/Sensors
+* RoboFail: Automatic/Sensors
+* HoloAssist: Human
+* AV: Automatic/Sensors
+**Labeling Method**:
+* RoboVQA:  Hybrid:  Human,Automated
+* BridgeDataV2: Hybrid:  Human,Automated
+* AgiBot: Hybrid:  Human,Automated
+* RoboFail: Hybrid:  Human,Automated
+* HoloAssist: Hybrid:  Human,Automated
+* AV: Hybrid:  Human,Automated
+**Metrics**:
+We report the model accuracy on the embodied reasoning benchmark introduced in [Cosmos-Reason1](https://arxiv.org/abs/2503.15558). The results differ from those presented in Table 9 due to additional training aimed at supporting a broader range of Physical AI tasks beyond the benchmark.
+|                   | [RoboVQA](https://robovqa.github.io/)        | AV       | [BridgeDataV2](https://rail-berkeley.github.io/bridgedata/)| [Agibot](https://github.com/OpenDriveLab/AgiBot-World)| [HoloAssist](https://holoassist.github.io/)       | [RoboFail](https://robot-reflect.github.io/)                               | Average |
+|--------------------|---------------------------------------------|----------|------------------------------------------------------|------------------------------------------------|------------------------------------------------|------------------------------------------------|------------------------------------------------|
+| **Accuracy**        | 87.3                                        | 70.8    | 63.7                                                 | 48.9                                          | 62.7                                           | 57.2                                            |  65.1                                          |
+## Dataset Format
+Modality: Video (mp4) and Text
+## Dataset Quantification
+### 05/17/2025
+We release the embodied reasoning data and benchmarks. Each data sample is a pair of video and text. The text annotations include understanding and reasoning annotations described in the Cosmos-Reason1 paper. Each video may have multiple text annotations. The quantity of the video and text pairs is described in the table below.
+**The AV data is currently unavailable and will be uploaded soon!**
+|                   | [RoboVQA](https://robovqa.github.io/)        | AV       | [BridgeDataV2](https://rail-berkeley.github.io/bridgedata/)| [Agibot](https://github.com/OpenDriveLab/AgiBot-World)| [HoloAssist](https://holoassist.github.io/)       | [RoboFail](https://robot-reflect.github.io/)                               | Total Storage Size |
+|--------------------|---------------------------------------------|----------|------------------------------------------------------|------------------------------------------------|------------------------------------------------|------------------------------------------------|--------------------|
+| **SFT Data**        | 1.14m                                        | 24.7k    | 258k                                                 | 38.9k                                          | 273k                                           | N/A                                            | **300.6GB**         |
+| **RL Data**         | 252                                          | 200      | 240                                                  | 200                                            | 200                                            | N/A                                            | **2.6GB**           |
+| **Benchmark Data**  | 110                                          | 100      | 100                                                  | 100                                            | 100                                            | 100                                            | **1.5GB**           |
+We release text annotations for all embodied reasoning datasets and videos for RoboVQA and AV datasets. For other datasets, users may download the source videos from the original data source and find corresponding video sources via the video names. The held-out RoboFail benchmark is released for measuring the generalization capability.
+### 08/01/2025
+|                   | [PLM-Video-Human](https://huggingface.co/datasets/facebook/PLM-Video-Human) | Nexar       | [Describe Anything](https://huggingface.co/datasets/nvidia/describe-anything-dataset)| ITS / Warehouse         | Visual Critics                             | Total Storage Size |
+|------------------ |-----------------------------------------------------------------------------|-------------|--------------------------------------------------------------------------------------|-------------------------|--------------------------------------------|--------------------|
+| **SFT Data**      | 39k                                                                         | 240k        | 178k                                                                                 | 24k                     | 24k                                        | **2.6TB**          |
+## Inference:
+**Test Hardware:** H100, A100, GB200 <br>
+> [!NOTE]
+> We suggest using `fps=4` for the input video and `max_tokens=4096` to avoid truncated responses.
+```python
+from transformers import AutoProcessor
+from vllm import LLM, SamplingParams
+from qwen_vl_utils import process_vision_info
+# You can also replace the MODEL_PATH by a safetensors folder path mentioned above
+MODEL_PATH = "nvidia/Cosmos-Reason1-7B"
+llm = LLM(
+    model=MODEL_PATH,
+    limit_mm_per_prompt={"image": 10, "video": 10},
+)
+sampling_params = SamplingParams(
+    temperature=0.6,
+    top_p=0.95,
+    repetition_penalty=1.05,
+    max_tokens=4096,
+)
+video_messages = [
+    {"role": "system", "content": "You are a helpful assistant. Answer the question in the following format: <think>\nyour reasoning\n</think>\n\n<answer>\nyour answer\n</answer>."},
+    {"role": "user", "content": [
+            {"type": "text", "text": (
+                    "Is it safe to turn right?"
+                )
+            },
+            {
+                "type": "video",
+                "video": "file:///path/to/your/video.mp4",
+                "fps": 4,
+            }
+        ]
+    },
+]
+# Here we use video messages as a demonstration
+messages = video_messages
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+prompt = processor.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True,
+)
+image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
+mm_data = {}
+if image_inputs is not None:
+    mm_data["image"] = image_inputs
+if video_inputs is not None:
+    mm_data["video"] = video_inputs
+llm_inputs = {
+    "prompt": prompt,
+    "multi_modal_data": mm_data,
+    # FPS will be returned in video_kwargs
+    "mm_processor_kwargs": video_kwargs,
+}
+outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+generated_text = outputs[0].outputs[0].text
+print(generated_text)
+```
+## Ethical Considerations
+NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications.  When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+Users are responsible for model inputs and outputs. Users are responsible for ensuring safe integration of this model, including implementing guardrails as well as other safety mechanisms, prior to deployment.
+For more detailed information on ethical considerations for this model, please see the subcards of Explainability, Bias, Safety & Security, and Privacy below.
+Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
+### Plus Plus (++) Promise
+We value you, the datasets, the diversity they represent, and what we have been entrusted with. This model and its associated data have been:
+* Verified to comply with current applicable disclosure laws, regulations, and industry standards.
+* Verified to comply with applicable privacy labeling requirements.
+* Annotated to describe the collector/source (NVIDIA or a third-party).
+* Characterized for technical limitations.
+* Reviewed to ensure proper disclosure is accessible to, maintained for, and in compliance with NVIDIA data subjects and their requests.
+* Reviewed before release.
+* Tagged for known restrictions and potential safety implications.
+### Bias
+| Field                                                                                                                                                            | Response |
+| :--------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------- |
+| Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | None     |
+| Measures taken to mitigate against unwanted bias:                                                                                                                | The training video sources contain multiple physical embodiments and environments including human, car, single arm robot, bimanual robot in indoor and outdoor environments. By training on numerous and various physical interactions and curated datasets, we strive to provide a model that does not possess biases towards certain embodiments or environments.   |
+### Explainability
+| Field                                                     | Response                                                                                                             |
+| :-------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------- |
+| Intended Application & Domain:                            | Physical AI Reasoning                                                                                                |
+| Model Type:                                               | Transformer                                                                                                          |
+| Intended Users:                                           | Physical AI developers                                                                                               |
+| Output:                                                   | Text                                                                                                                 |
+| Describe how the model works:                             | Given a video/image and a text prompt, the model first converts the video/image into tokens using a vision encoder and a special translator called a projector. These video tokens are combined with the text prompt and fed into the core model, which uses a mix of LLM modules and techniques. This enables the model to think step-by-step and provide detailed, logical responses.                                                         |
+| Technical Limitations:                                    | The model may not follow the video or text input accurately in challenging cases, where the input video shows complex scene composition and temporal dynamics. Examples of challenging scenes include: fast camera movements, overlapping human-object interactions, low lighting with high motion blur, and multiple people performing different actions simultaneously.  |
+| Verified to have met prescribed NVIDIA quality standards: | Yes                                                                                                                  |
+| Performance Metrics:                                      | Quantitative and Qualitative Evaluation. Cosmos-Reason1 proposes the embodied reasoning benchmark and physical common sense benchmark to evaluate accuracy with visual question answering.                                                                              |
+| Potential Known Risks:                                    | The model's output can generate all forms of texts, including what may be considered toxic, offensive, or indecent.  |
+| Licensing:                                                | [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). Additional Information: [Apache License 2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).   |
+### Privacy
+| Field                                                               | Response       |
+| :------------------------------------------------------------------ | :------------- |
+| Generatable or reverse engineerable personal information?           | None Known     |
+| Protected class data used to create this model?                     | None Known     |
+| Was consent obtained for any personal data used?                    | None Known     |
+| How often is dataset reviewed?                                      | Before Release |
+| Is there provenance for all datasets used in training?              | Yes            |
+| Does data labeling (annotation, metadata) comply with privacy laws? | Yes            |
+| Applicable Privacy Policy | [NVIDIA Privacy Policy](https://www.nvidia.com/en-us/about-nvidia/privacy-policy)            |
+### Safety
+| Field                                           | Response                                                                                                                                                                                                                                                                                                                             |
+| :---------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Model Application(s):                           | Physical AI common sense understanding and embodied reasoning                                                                                                                                                                                                                                                                                                                     |
+| Describe the life critical impact (if present). | None Known                                                                                                                                                                                                                                                                                                                           |
+| Use Case Restrictions:                          | [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). Additional Information: [Apache License 2.0](https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md).                                                                                                                                                                                                                   |
+| Model and dataset restrictions:                 | The Principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. Model checkpoints are made available on Hugging Face and may become available on cloud providers' model catalogs. |

text_encoder/chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "architectures": [
+    "Qwen2_5_VLForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "vision_start_token_id": 151652,
+  "vision_end_token_id": 151653,
+  "vision_token_id": 151654,
+  "image_token_id": 151655,
+  "video_token_id": 151656,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 128000,
+  "max_window_layers": 28,
+  "model_type": "qwen2_5_vl",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.41.2",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vision_config": {
+    "depth": 32,
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "intermediate_size": 3420,
+    "num_heads": 16,
+    "in_chans": 3,
+    "out_hidden_size": 3584,
+    "patch_size": 14,
+    "spatial_merge_size": 2,
+    "spatial_patch_size": 14,
+    "window_size": 112,
+    "fullatt_block_indexes": [
+      7,
+      15,
+      23,
+      31
+    ],
+    "tokens_per_second": 2,
+    "temporal_patch_size": 2
+  },
+  "rope_scaling": {
+    "type": "mrope",
+    "mrope_section": [
+      16,
+      24,
+      24
+    ]
+  },
+  "vocab_size": 152064
+}

text_encoder/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "pad_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "repetition_penalty": 1.05,
+  "temperature": 0.000001,
+  "transformers_version": "4.37.0"
+}

text_encoder/model-00001-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c28404126221997ae8eb70a23b919c96174d42e35ae1d537e0c95093d50b359a
+size 4968243304

text_encoder/model-00002-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f281081864c10992d3e03874c79d526c84407e049d713747f19eb9c79cd16db3
+size 4991495816

text_encoder/model-00003-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bb3a62a8d0e83c6283388ddea99395b221f908f9181b8edd0f7f91d02260ebe
+size 4932751040

text_encoder/model-00004-of-00004.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91bacbe1ad798e16daa05023b4e4bec70b53c8cd7d757db86c5bc76c4e0bbf15
+size 1691924384

text_encoder/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,736 @@

+{
+  "metadata": {
+    "total_size": 16584333312
+  },
+  "weight_map": {
+    "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
+    "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
+    "model.norm.weight": "model-00004-of-00004.safetensors",
+    "lm_head.weight": "model-00004-of-00004.safetensors"
+  }
+}

text_encoder/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "min_pixels": 3136,
+  "max_pixels": 12845056,
+  "patch_size": 14,
+  "temporal_patch_size": 2,
+  "merge_size": 2,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "processor_class": "Qwen2_5_VLProcessor"
+}

text_encoder/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

text_encoder/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "add_bos_token": false
+}

transformer/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "action_dim": 384,
+  "adaln_lora_dim": 256,
+  "base_fps": 24,
+  "concat_padding_mask": true,
+  "condition_model_type": "DreamDojoConditionModel",
+  "crossattn_emb_channels": 1024,
+  "crossattn_proj_in_channels": 100352,
+  "enable_fps_modulation": true,
+  "eps": 1e-06,
+  "hidden_dim_in_action_embedder": null,
+  "in_channels": 16,
+  "max_frames": 128,
+  "max_img_h": 240,
+  "max_img_w": 240,
+  "mlp_ratio": 4.0,
+  "model_channels": 2048,
+  "model_type": "dreamdojo",
+  "motion_consistency_dim": 1,
+  "motion_consistency_weight": 0.1,
+  "num_action_per_latent_frame": 4,
+  "num_blocks": 28,
+  "num_heads": 16,
+  "out_channels": 16,
+  "patch_spatial": 2,
+  "patch_temporal": 1,
+  "rope_h_extrapolation_ratio": 1.0,
+  "rope_t_extrapolation_ratio": 1.0,
+  "rope_w_extrapolation_ratio": 1.0,
+  "shift": 5.0,
+  "tie_word_embeddings": false,
+  "timestep_scale": 0.001,
+  "transformers_version": "4.57.3",
+  "use_adaln_lora": true,
+  "use_crossattn_projection": true,
+  "zero_init_action_embedder": false
+}

transformer/model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c13a2a094fb87edd5d65b5be2313e3a1938c98fce76013297874be39025c2df
+size 1996830696

transformer/model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6aa46421383876a2cbbf0d94aa409ddb38881b73a01f77905d3f9bfc56cb879f
+size 1999983472

transformer/model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50b3358298eaafb3d563f3aac3e4c560e7e93f0d6dcc2ee3187464c376623cb8
+size 100675824

transformer/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,582 @@

+{
+  "metadata": {
+    "total_size": 4097425408
+  },
+  "weight_map": {
+    "x_embedder.proj.1.weight": "model-00001-of-00003.safetensors",
+    "t_embedder.1.linear_1.weight": "model-00001-of-00003.safetensors",
+    "t_embedder.1.linear_2.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.0.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.1.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.2.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.3.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.4.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.5.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.6.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.7.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.8.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.9.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.10.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.11.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.12.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.self_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.cross_attn.output_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.mlp.layer1.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.mlp.layer2.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_self_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_self_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_cross_attn.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_cross_attn.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_mlp.1.weight": "model-00001-of-00003.safetensors",
+    "blocks.13.adaln_modulation_mlp.2.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "blocks.14.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.14.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.15.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.16.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.17.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.18.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.19.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.20.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.21.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.22.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.23.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.24.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.25.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.26.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.self_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.q_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.k_norm.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.cross_attn.output_proj.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.mlp.layer1.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.mlp.layer2.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_self_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_self_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_cross_attn.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_cross_attn.2.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_mlp.1.weight": "model-00002-of-00003.safetensors",
+    "blocks.27.adaln_modulation_mlp.2.weight": "model-00002-of-00003.safetensors",
+    "final_layer.linear.weight": "model-00002-of-00003.safetensors",
+    "final_layer.adaln_modulation.1.weight": "model-00002-of-00003.safetensors",
+    "final_layer.adaln_modulation.2.weight": "model-00002-of-00003.safetensors",
+    "t_embedding_norm.weight": "model-00002-of-00003.safetensors",
+    "action_embedder_B_D.fc1.weight": "model-00002-of-00003.safetensors",
+    "action_embedder_B_D.fc1.bias": "model-00002-of-00003.safetensors",
+    "action_embedder_B_D.fc2.weight": "model-00002-of-00003.safetensors",
+    "action_embedder_B_D.fc2.bias": "model-00002-of-00003.safetensors",
+    "action_embedder_B_3D.fc1.weight": "model-00002-of-00003.safetensors",
+    "action_embedder_B_3D.fc1.bias": "model-00002-of-00003.safetensors",
+    "action_embedder_B_3D.fc2.weight": "model-00003-of-00003.safetensors",
+    "action_embedder_B_3D.fc2.bias": "model-00003-of-00003.safetensors"
+  }
+}

vae/config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "_class_name": "AutoencoderKLWan",
+  "_diffusers_version": "0.33.0.dev0",
+  "attn_scales": [],
+  "base_dim": 96,
+  "dim_mult": [
+    1,
+    2,
+    4,
+    4
+  ],
+  "dropout": 0.0,
+  "latents_mean": [
+    -0.7571,
+    -0.7089,
+    -0.9113,
+    0.1075,
+    -0.1745,
+    0.9653,
+    -0.1517,
+    1.5508,
+    0.4134,
+    -0.0715,
+    0.5517,
+    -0.3632,
+    -0.1922,
+    -0.9497,
+    0.2503,
+    -0.2921
+  ],
+  "latents_std": [
+    2.8184,
+    1.4541,
+    2.3275,
+    2.6558,
+    1.2196,
+    1.7708,
+    2.6052,
+    2.0743,
+    3.2687,
+    2.1526,
+    2.8652,
+    1.5579,
+    1.6382,
+    1.1253,
+    2.8251,
+    1.916
+  ],
+  "num_res_blocks": 2,
+  "temperal_downsample": [
+    false,
+    true,
+    true
+  ],
+  "z_dim": 16
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6e524b3fffede1787a74e81b30976dce5400c4439ba64222168e607ed19e793
+size 507591892