aka7774 committed on
Commit
7d8ed46
·
verified ·
1 Parent(s): 53805a4

Upload 43 files

Browse files
Files changed (44) hide show
  1. .gitattributes +16 -0
  2. aratako_tts/Anime-XCodec2-44.1kHz-v2/.gitattributes +35 -0
  3. aratako_tts/Anime-XCodec2-44.1kHz-v2/README.md +119 -0
  4. aratako_tts/Anime-XCodec2-44.1kHz-v2/ckpt/final.ckpt +3 -0
  5. aratako_tts/Anime-XCodec2-44.1kHz-v2/config.json +22 -0
  6. aratako_tts/Anime-XCodec2-44.1kHz-v2/model.safetensors +3 -0
  7. aratako_tts/Anime-XCodec2-44.1kHz-v2/xcodec2-0.1.7.tar.gz +3 -0
  8. aratako_tts/T5Gemma-TTS-2b-2b/.gitattributes +50 -0
  9. aratako_tts/T5Gemma-TTS-2b-2b/GEMMA_PROHIBITED_USE_POLICY.md +38 -0
  10. aratako_tts/T5Gemma-TTS-2b-2b/GEMMA_TERMS_OF_USE.md +124 -0
  11. aratako_tts/T5Gemma-TTS-2b-2b/NOTICE +1 -0
  12. aratako_tts/T5Gemma-TTS-2b-2b/README.md +170 -0
  13. aratako_tts/T5Gemma-TTS-2b-2b/README_ja.md +162 -0
  14. aratako_tts/T5Gemma-TTS-2b-2b/architecture.png +3 -0
  15. aratako_tts/T5Gemma-TTS-2b-2b/ckpt/pretrained.pth +3 -0
  16. aratako_tts/T5Gemma-TTS-2b-2b/config.json +349 -0
  17. aratako_tts/T5Gemma-TTS-2b-2b/configuration_t5gemma_voice.py +119 -0
  18. aratako_tts/T5Gemma-TTS-2b-2b/generation_config.json +7 -0
  19. aratako_tts/T5Gemma-TTS-2b-2b/model-00001-of-00003.safetensors +3 -0
  20. aratako_tts/T5Gemma-TTS-2b-2b/model-00002-of-00003.safetensors +3 -0
  21. aratako_tts/T5Gemma-TTS-2b-2b/model-00003-of-00003.safetensors +3 -0
  22. aratako_tts/T5Gemma-TTS-2b-2b/model.safetensors.index.json +744 -0
  23. aratako_tts/T5Gemma-TTS-2b-2b/modeling_t5gemma_voice.py +833 -0
  24. aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample1.wav +3 -0
  25. aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_fast.wav +0 -0
  26. aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_normal.wav +3 -0
  27. aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_slow.wav +3 -0
  28. aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample1.wav +3 -0
  29. aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample2.wav +3 -0
  30. aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample3.wav +3 -0
  31. aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample1.wav +3 -0
  32. aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_fast.wav +3 -0
  33. aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_normal.wav +3 -0
  34. aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_slow.wav +3 -0
  35. aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample1.wav +3 -0
  36. aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample2.wav +3 -0
  37. aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample3.wav +3 -0
  38. aratako_tts/T5Gemma-TTS-2b-2b/samples/zh_sample1.wav +3 -0
  39. aratako_tts/t5gemma-tokenizer/config.json +126 -0
  40. aratako_tts/t5gemma-tokenizer/generation_config.json +10 -0
  41. aratako_tts/t5gemma-tokenizer/special_tokens_map.json +34 -0
  42. aratako_tts/t5gemma-tokenizer/tokenizer.json +3 -0
  43. aratako_tts/t5gemma-tokenizer/tokenizer.model +3 -0
  44. aratako_tts/t5gemma-tokenizer/tokenizer_config.json +2014 -0
.gitattributes CHANGED
@@ -33,3 +33,19 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ aratako_tts/t5gemma-tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ aratako_tts/T5Gemma-TTS-2b-2b/architecture.png filter=lfs diff=lfs merge=lfs -text
38
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample1.wav filter=lfs diff=lfs merge=lfs -text
39
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_normal.wav filter=lfs diff=lfs merge=lfs -text
40
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_slow.wav filter=lfs diff=lfs merge=lfs -text
41
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample1.wav filter=lfs diff=lfs merge=lfs -text
42
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample2.wav filter=lfs diff=lfs merge=lfs -text
43
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample3.wav filter=lfs diff=lfs merge=lfs -text
44
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample1.wav filter=lfs diff=lfs merge=lfs -text
45
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_fast.wav filter=lfs diff=lfs merge=lfs -text
46
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_normal.wav filter=lfs diff=lfs merge=lfs -text
47
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_slow.wav filter=lfs diff=lfs merge=lfs -text
48
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample1.wav filter=lfs diff=lfs merge=lfs -text
49
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample2.wav filter=lfs diff=lfs merge=lfs -text
50
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample3.wav filter=lfs diff=lfs merge=lfs -text
51
+ aratako_tts/T5Gemma-TTS-2b-2b/samples/zh_sample1.wav filter=lfs diff=lfs merge=lfs -text
aratako_tts/Anime-XCodec2-44.1kHz-v2/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
aratako_tts/Anime-XCodec2-44.1kHz-v2/README.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - ja
5
+ base_model:
6
+ - NandemoGHS/Anime-XCodec2
7
+ pipeline_tag: audio-to-audio
8
+ tags:
9
+ - audio-to-audio
10
+ - speech
11
+ ---
12
+ # Anime-XCodec2-44.1kHz-v2: A 44.1kHz Upsampling Variant of Anime-XCodec2 (v2)
13
+
14
+ [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-CC%20BY--NC%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc/4.0/)
15
+
16
+ **TL;DR**: `Anime-XCodec2-44.1kHz-v2` is a fine-tuned variant of **NandemoGHS/Anime-XCodec2**. It incorporates **upsampling layers** and **RMS loss** (inspired by **Inworld TTS-1**) to produce **44.1kHz** output, trained on ~22k hours of Japanese speech. This v2 updates upsampler parameters, loss configurations, and fixes a RoPE bug from the original XCodec2.
17
+
18
+ Only the **decoder** was updated; the **encoder and codebook remain frozen**, so **speech tokens are identical to the original XCodec2**. This makes the model a drop‑in decoder for downstream systems that already work with XCodec2 tokens (*e.g., Llasa*).
19
+
20
+ ---
21
+
22
+ ## 🔗 Quick Links
23
+
24
+ * **Demo (Gradio / Hugging Face Spaces)**: [https://huggingface.co/spaces/OmniAICreator/Anime-XCodec2-44.1kHz-v2-Demo](https://huggingface.co/spaces/OmniAICreator/Anime-XCodec2-44.1kHz-v2-Demo)
25
+ * **This repository (v2 44.1kHz fine-tune)**: `NandemoGHS/Anime-XCodec2-44.1kHz-v2`
26
+ * **Baseline 16kHz model**: `NandemoGHS/Anime-XCodec2`
27
+ * **Original XCodec2**: `HKUSTAudio/xcodec2`
28
+ * **Reference Paper (Inworld TTS-1)**: [https://arxiv.org/abs/2507.21138](https://arxiv.org/abs/2507.21138)
29
+ * **Reference Implementation (Inworld TTS)**: [https://github.com/inworld-ai/tts](https://github.com/inworld-ai/tts)
30
+
31
+ ---
32
+
33
+ ## 1) Model Summary
34
+
35
+ * **What it is**: A neural speech codec based on **Anime-XCodec2** (which is based on XCodec2), fine-tuned to output **44.1kHz** high-fidelity Japanese speech (anime/game-style). (Version 2)
36
+ * **Key Change**: Integrates an **UpSamplerBlock** and utilizes **RMS Loss** (inspired by [Inworld TTS-1](https://arxiv.org/abs/2507.21138)) into the decoder architecture.
37
+ * **Training scope**: **Decoder-only** fine-tuning on ~**22,000 hours** of Japanese data. **Encoder** and **codebook** are **frozen**.
38
+ * **Compatibility**: **Speech tokens are identical** to `HKUSTAudio/xcodec2` and `NandemoGHS/Anime-XCodec2`.
39
+ * **Input Sampling rate**: **16 kHz** (for encoding, same as XCodec2).
40
+ * **Output Sampling rate**: **44.1 kHz** (decoded audio).
41
+
42
+ ---
43
+
44
+ ## 2) Intended Use
45
+
46
+ * **Decode XCodec2 speech tokens** (e.g., from Llasa or other AR generators) into **high-fidelity 44.1kHz Japanese speech** (anime/game-style).
47
+ * Upgrade existing `Anime-XCodec2` (16kHz) pipelines to 44.1kHz output.
48
+ * **Audio Super-Resolution**: As the model accepts 16kHz input and outputs 44.1kHz reconstructed audio, it can also be used as a form of audio super-resolution. However, its performance for this specific purpose is untested/unevaluated.
49
+
50
+ ---
51
+
52
+ ## 3) How to Use (Important)
53
+
54
+ This model **modifies the original XCodec2 architecture** (upsampler blocks) and **requires a custom library version** that includes a fix for the RoPE bug ([Issue #36](https://github.com/zhenye234/X-Codec-2.0/issues/36)).
55
+
56
+ You **MUST** use the provided custom `xcodec2` library fork (v0.1.7 or later) for inference. The standard library or older custom libraries (like 0.1.6) **will not work**.
57
+
58
+ * **Installation:**
59
+ ```bash
60
+ # Install the custom xcodec2 library (v0.1.7)
61
+ pip install https://huggingface.co/NandemoGHS/Anime-XCodec2-44.1kHz-v2/resolve/main/xcodec2-0.1.7.tar.gz
62
+ ```
63
+
64
+ * **Usage:**
65
+ Once the custom library is installed, you can load and use this model just as you would the original XCodec2 or Anime-XCodec2 models. The core inference logic remains the same.
66
+
67
+ For a complete, working code example, please refer to my Hugging Face Spaces Demo: https://huggingface.co/spaces/OmniAICreator/Anime-XCodec2-44.1kHz-v2-Demo
68
+
69
+ ---
70
+
71
+ ## 4) Limitations & Trade-offs
72
+
73
+ * **Language scope**: Optimized for **Japanese**. Performance on other languages may degrade.
74
+ * **Content domain**: Tuned toward **anime/game-style** voices.
75
+ * **Library Dependency**: **Requires the specific custom `xcodec2` library (v0.1.7)** linked above. It is not compatible with the original `xcodec2` library or previous custom forks (e.g., v0.1.6).
76
+
77
+ ---
78
+
79
+ ## 5) Data (High-Level)
80
+
81
+ * ~**22,000 hours** of Japanese speech, with a focus on **anime/game-style voices**.
82
+ * Data was prepared for 44.1kHz target output during training.
83
+
84
+ ---
85
+
86
+ ## 6) Training Procedure (High-Level)
87
+
88
+ * **Base Model**: `NandemoGHS/Anime-XCodec2` (16kHz)
89
+ * **Architecture Modification**:
90
+ * Integrated the `UpSamplerBlock` from the [Inworld TTS-1 implementation](https://github.com/inworld-ai/tts) into the decoder.
91
+ * **Loss Function**:
92
+ * Adopted **RMS Loss** (Root Mean Square loss) (from Inworld TTS-1), in addition to original losses.
93
+ * **Frozen**: Encoder and Codebook (token compatibility preserved).
94
+ * **Updated (fine-tuned)**: `generator.backbone`, `generator.head`, `generator.upsampler`, `fc_post_a`
95
+
96
+ ### Key Updates in v2
97
+
98
+ Compared to the first version, this v2 model includes the following key updates to the training configuration:
99
+
100
+ 1. **RoPE Bug Fix**: Corrected a RoPE (Rotary Position Embedding) bug present in the original XCodec2 implementation (See [Issue #36](https://github.com/zhenye234/X-Codec-2.0/issues/36)).
101
+ 2. **Upsampler Parameters**: The upsampler settings were changed to `hop_length=98`, `upsample_factors=[3, 3]`, and `kernel_sizes=[9, 9]`.
102
+ 3. **Perceptual Loss Model**: The model used for calculating perceptual loss was switched from [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) to [imprt/kushinada-hubert-large](https://huggingface.co/imprt/kushinada-hubert-large).
103
+ 4. **Spectral Discriminator Tuning**: The STFT (Short-Time Fourier Transform) settings for the spectral discriminator were adjusted to be more suitable for 44.1kHz high-sampling-rate audio.
104
+
105
+ ---
106
+
107
+ ## 7) License
108
+
109
+ * **CC-BY-NC 4.0** (inherited from XCodec2 and Anime-XCodec2).
110
+ * See: [https://creativecommons.org/licenses/by-nc/4.0/](https://creativecommons.org/licenses/by-nc/4.0/)
111
+
112
+ ---
113
+
114
+ ## 8) Acknowledgements
115
+
116
+ * **HKUSTAudio/xcodec2** (Original model)
117
+ * **Inworld AI** for their work on [Inworld TTS-1](https://arxiv.org/abs/2507.21138) (Upsampler architecture and RMS Loss).
118
+ * **imprt** for the `kushinada-hubert-large` model used in perceptual loss.
119
+ * Thanks to contributors and the community around Japanese speech resources.
aratako_tts/Anime-XCodec2-44.1kHz-v2/ckpt/final.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037db0b34dd734a805ad7a3dd1d9a2430b2690042631d5e5f6e6cc2028df0e0c
3
+ size 6537383183
aratako_tts/Anime-XCodec2-44.1kHz-v2/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XCodec2Model"
4
+ ],
5
+ "codec_decoder_hidden_size": 1024,
6
+ "codec_encoder_hidden_size": 1024,
7
+ "model_type": "xcodec2",
8
+ "semantic_hidden_size": 1024,
9
+ "torch_dtype": "float32",
10
+ "transformers_version": "4.48.0",
11
+ "use_vocos": true,
12
+ "hop_length": 98,
13
+ "sample_rate": 44100,
14
+ "upsample_factors": [
15
+ 3,
16
+ 3
17
+ ],
18
+ "upsample_kernel_sizes": [
19
+ 9,
20
+ 9
21
+ ]
22
+ }
aratako_tts/Anime-XCodec2-44.1kHz-v2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9c4ba4d0261d9ec1db0e323273c3e05d68725a055673874551ce78365822352
3
+ size 3320002448
aratako_tts/Anime-XCodec2-44.1kHz-v2/xcodec2-0.1.7.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae197588f5621f521c7a715d0d7f8985de9a5921d1644c7f0a8078d2c5ade2f
3
+ size 23367
aratako_tts/T5Gemma-TTS-2b-2b/.gitattributes ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples/en_sample1.wav filter=lfs diff=lfs merge=lfs -text
37
+ samples/en_sample2_normal.wav filter=lfs diff=lfs merge=lfs -text
38
+ samples/en_sample2_slow.wav filter=lfs diff=lfs merge=lfs -text
39
+ samples/gen_sample1.wav filter=lfs diff=lfs merge=lfs -text
40
+ samples/gen_sample2.wav filter=lfs diff=lfs merge=lfs -text
41
+ samples/gen_sample3.wav filter=lfs diff=lfs merge=lfs -text
42
+ samples/jp_sample1.wav filter=lfs diff=lfs merge=lfs -text
43
+ samples/jp_sample2_fast.wav filter=lfs diff=lfs merge=lfs -text
44
+ samples/jp_sample2_normal.wav filter=lfs diff=lfs merge=lfs -text
45
+ samples/jp_sample2_slow.wav filter=lfs diff=lfs merge=lfs -text
46
+ samples/ref_sample1.wav filter=lfs diff=lfs merge=lfs -text
47
+ samples/ref_sample2.wav filter=lfs diff=lfs merge=lfs -text
48
+ samples/ref_sample3.wav filter=lfs diff=lfs merge=lfs -text
49
+ samples/zh_sample1.wav filter=lfs diff=lfs merge=lfs -text
50
+ architecture.png filter=lfs diff=lfs merge=lfs -text
aratako_tts/T5Gemma-TTS-2b-2b/GEMMA_PROHIBITED_USE_POLICY.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Gemma Prohibited Use Policy
2
+
3
+ Google reserves the right to update this Gemma Prohibited Use Policy from time
4
+ to time.
5
+
6
+ Last modified: February 21, 2024
7
+
8
+ You **may not** use nor allow others to use Gemma or Model Derivatives to:
9
+
10
+ 1. Generate any content, including the outputs or results generated by Gemma or Model Derivatives, that infringes, misappropriates, or otherwise violates any individual's or entity's rights (including, but not limited to rights in copyrighted content).
11
+ 2. Perform or facilitate dangerous, illegal, or malicious activities, including:
12
+ 1. Facilitation or promotion of illegal activities or violations of law, such as:
13
+ 1. Promoting or generating content related to child sexual abuse or exploitation;
14
+ 2. Promoting or facilitating sale of, or providing instructions for synthesizing or accessing, illegal substances, goods, or services;
15
+ 3. Facilitating or encouraging users to commit any type of crimes; or
16
+ 4. Promoting or generating violent extremism or terrorist content.
17
+ 2. Engagement in the illegal or unlicensed practice of any vocation or profession including, but not limited to, legal, medical, accounting, or financial professional practices.
18
+ 3. Abuse, harm, interference, or disruption of services (or enable others to do the same), such as:
19
+ 1. Promoting or facilitating the generation or distribution of spam; or
20
+ 2. Generating content for deceptive or fraudulent activities, scams, phishing, or malware.
21
+ 4. Attempts to override or circumvent safety filters or intentionally drive Gemma or Model Derivatives to act in a manner that contravenes this Gemma Prohibited Use Policy.
22
+ 5. Generation of content that may harm or promote the harm of individuals or a group, such as:
23
+ 1. Generating content that promotes or encourages hatred;
24
+ 2. Facilitating methods of harassment or bullying to intimidate, abuse, or insult others;
25
+ 3. Generating content that facilitates, promotes, or incites violence;
26
+ 4. Generating content that facilitates, promotes, or encourages self harm;
27
+ 5. Generating personally identifying information for distribution or other harms;
28
+ 6. Tracking or monitoring people without their consent;
29
+ 7. Generating content that may have unfair or adverse impacts on people, particularly impacts related to sensitive or protected characteristics; or
30
+ 8. Generating, gathering, processing, or inferring sensitive personal or private information about individuals without obtaining all rights, authorizations, and consents required by applicable laws.
31
+ 3. Generate and distribute content intended to misinform, misrepresent or mislead, including:
32
+ 1. Misrepresentation of the provenance of generated content by claiming content was created by a human, or represent generated content as original works, in order to deceive;
33
+ 2. Generation of content that impersonates an individual (living or dead) without explicit disclosure, in order to deceive;
34
+ 3. Misleading claims of expertise or capability made particularly in sensitive areas (e.g. health, finance, government services, or legal);
35
+ 4. Making automated decisions in domains that affect material or individual rights or well-being (e.g., finance, legal, employment, healthcare, housing, insurance, and social welfare);
36
+ 5. Generation of defamatory content, including defamatory statements, images, or audio content; or
37
+ 6. Engaging in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices.
38
+ 4. Generate sexually explicit content, including content created for the purposes of pornography or sexual gratification (e.g. sexual chatbots). Note that this does not include content created for scientific, educational, documentary, or artistic purposes.
aratako_tts/T5Gemma-TTS-2b-2b/GEMMA_TERMS_OF_USE.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <br />
2
+
3
+ Last modified: March 24, 2025
4
+
5
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of Gemma, Model Derivatives including via any Hosted Service, (each as defined below) (collectively, the "**Gemma Services**") or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement.
6
+
7
+ ## Section 1: DEFINITIONS
8
+
9
+ ### 1.1 Definitions
10
+
11
+ (a) "**Agreement** " or "**Gemma Terms of Use**" means these terms and conditions that govern the use, reproduction, Distribution or modification of the Gemma Services and any terms and conditions incorporated by reference.
12
+
13
+ (b) "**Distribution** " or "**Distribute** " means any transmission, publication, or other sharing of Gemma or Model Derivatives to a third party, including by providing or making Gemma or its functionality available as a hosted service via API, web access, or any other electronic or remote means ("**Hosted Service**").
14
+
15
+ (c) "**Gemma** " means the set of machine learning language models, trained model weights and parameters identified in the [Appendix](https://ai.google.dev/gemma/terms#appendix), regardless of the source that you obtained it from.
16
+
17
+ (d) "**Google**" means Google LLC.
18
+
19
+ (e) "**Model Derivatives**" means all (i) modifications to Gemma, (ii) works based on Gemma, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Gemma, to that model in order to cause that model to perform similarly to Gemma, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by Gemma for training that model. For clarity, Outputs are not deemed Model Derivatives.
20
+
21
+ (f) "**Output**" means the information content output of Gemma or a Model Derivative that results from operating or otherwise using Gemma or the Model Derivative, including via a Hosted Service.
22
+
23
+ ### 1.2
24
+
25
+ As used in this Agreement, "**including** " means "**including without limitation**".
26
+
27
+ ## Section 2: ELIGIBILITY AND USAGE
28
+
29
+ ### 2.1 Eligibility
30
+
31
+ You represent and warrant that you have the legal capacity to enter into this Agreement (including being of sufficient age of consent). If you are accessing or using any of the Gemma Services for or on behalf of a legal entity, (a) you are entering into this Agreement on behalf of yourself and that legal entity, (b) you represent and warrant that you have the authority to act on behalf of and bind that entity to this Agreement and (c) references to "**you** " or "**your**" in the remainder of this Agreement refers to both you (as an individual) and that entity.
32
+
33
+ ### 2.2 Use
34
+
35
+ You may use, reproduce, modify, Distribute, perform or display any of the Gemma Services only in accordance with the terms of this Agreement, and must not violate (or encourage or permit anyone else to violate) any term of this Agreement.
36
+
37
+ ## Section 3: DISTRIBUTION AND RESTRICTIONS
38
+
39
+ ### 3.1 Distribution and Redistribution
40
+
41
+ You may reproduce or Distribute copies of Gemma or Model Derivatives if you meet all of the following conditions:
42
+
43
+ 1. You must include the use restrictions referenced in Section 3.2 as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Gemma or Model Derivatives and you must provide notice to subsequent users you Distribute to that Gemma or Model Derivatives are subject to the use restrictions in Section 3.2.
44
+ 2. You must provide all third party recipients of Gemma or Model Derivatives a copy of this Agreement.
45
+ 3. You must cause any modified files to carry prominent notices stating that you modified the files.
46
+ 4. All Distributions (other than through a Hosted Service) must be accompanied by a "**Notice** " text file that contains the following notice: "**Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms**".
47
+
48
+ You may add your own intellectual property statement to your modifications and, except as set forth in this Section, may provide additional or different terms and conditions for use, reproduction, or Distribution of your modifications, or for any such Model Derivatives as a whole, provided your use, reproduction, modification, Distribution, performance, and display of Gemma otherwise complies with the terms and conditions of this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
49
+
50
+ ### 3.2 Use Restrictions
51
+
52
+ You must not use any of the Gemma Services:
53
+
54
+ 1. for the restricted uses set forth in the Gemma Prohibited Use Policy at [ai.google.dev/gemma/prohibited_use_policy](https://ai.google.dev/gemma/prohibited_use_policy) ("**Prohibited Use Policy**"), which is hereby incorporated by reference into this Agreement; or
55
+ 2. in violation of applicable laws and regulations.
56
+
57
+ To the maximum extent permitted by law, Google reserves the right to restrict (remotely or otherwise) usage of any of the Gemma Services that Google reasonably believes are in violation of this Agreement.
58
+
59
+ ### 3.3 Generated Output
60
+
61
+ Google claims no rights in Outputs you generate using Gemma. You and your users are solely responsible for Outputs and their subsequent uses.
62
+
63
+ ## Section 4: ADDITIONAL PROVISIONS
64
+
65
+ ### 4.1 Updates
66
+
67
+ Google may update Gemma from time to time.
68
+
69
+ ### 4.2 Trademarks
70
+
71
+ Nothing in this Agreement grants you any rights to use Google's trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between you and Google. Google reserves any rights not expressly granted herein.
72
+
73
+ ### 4.3 DISCLAIMER OF WARRANTY
74
+
75
+ UNLESS REQUIRED BY APPLICABLE LAW, THE GEMMA SERVICES, AND OUTPUTS, ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE GEMMA SERVICES OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR USE OR DISTRIBUTION OF ANY OF THE GEMMA SERVICES OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
76
+
77
+ ### 4.4 LIMITATION OF LIABILITY
78
+
79
+ TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY, CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW, SHALL GOOGLE OR ITS AFFILIATES BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO, ANY OF THE GEMMA SERVICES OR OUTPUTS EVEN IF GOOGLE OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
80
+
81
+ ### 4.5 Term, Termination, and Survival
82
+
83
+ The term of this Agreement will commence upon your acceptance of this Agreement (including acceptance by your use, modification, or Distribution, reproduction, performance or display of any portion or element of the Gemma Services) and will continue in full force and effect until terminated in accordance with the terms of this Agreement. Google may terminate this Agreement if you are in breach of any term of this Agreement. Upon termination of this Agreement, you must delete and cease use and Distribution of all copies of Gemma and Model Derivatives in your possession or control. Sections 1, 2.1, 3.3, 4.2 to 4.9 shall survive the termination of this Agreement.
84
+
85
+ ### 4.6 Governing Law and Jurisdiction
86
+
87
+ This Agreement will be governed by the laws of the State of California without regard to choice of law principles. The UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The state and federal courts of Santa Clara County, California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
88
+
89
+ ### 4.7 Severability
90
+
91
+ If any provision of this Agreement is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
92
+
93
+ ### 4.8 Entire Agreement
94
+
95
+ This Agreement states all the terms agreed between the parties and supersedes all other agreements between the parties as of the date of acceptance relating to its subject matter.
96
+
97
+ ### 4.9 No Waiver
98
+
99
+ Google will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
100
+
101
+ ## Appendix
102
+
103
+ - [Gemma 1](https://ai.google.dev/gemma/docs/core/model_card)
104
+ - [Gemma 1.1](https://ai.google.dev/gemma/docs/core/model_card)
105
+ - [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2)
106
+ - [Gemma 3](https://ai.google.dev/gemma/docs/core/model_card_3)
107
+ - [Gemma 3n](https://ai.google.dev/gemma/docs/3n)
108
+ - [EmbeddingGemma](https://ai.google.dev/gemma/docs/embeddinggemma)
109
+ - [PaliGemma](https://ai.google.dev/gemma/docs/paligemma/model-card)
110
+ - [PaliGemma 2](https://ai.google.dev/gemma/docs/paligemma/model-card-2)
111
+ - [ShieldGemma](https://ai.google.dev/gemma/docs/shieldgemma/model_card)
112
+ - [ShieldGemma 2](https://ai.google.dev/gemma/docs/shieldgemma/model_card_2)
113
+ - [CodeGemma](https://ai.google.dev/gemma/docs/codegemma/model_card)
114
+ - [CodeGemma 1.1](https://ai.google.dev/gemma/docs/codegemma/model_card)
115
+ - [Gemma 2 JPN](https://huggingface.co/google/gemma-2-2b-jpn-it)
116
+ - [DataGemma RIG](https://www.kaggle.com/models/google/datagemma-rig)
117
+ - [DataGemma RAG](https://www.kaggle.com/models/google/datagemma-rag)
118
+ - [RecurrentGemma](https://ai.google.dev/gemma/docs/recurrentgemma/model_card)
119
+ - [Gemma Scope](https://ai.google.dev/gemma/docs/gemma_scope)
120
+ - [Gemma-APS](https://ai.google.dev/gemma/docs/gemma-aps)
121
+ - [T5Gemma](https://www.kaggle.com/models/google/t5gemma)
122
+ - [VaultGemma](https://www.kaggle.com/models/google/vaultgemma)
123
+
124
+ | **Note:** Previous versions of these Terms are [archived here](https://ai.google.dev/gemma/terms-archive).
aratako_tts/T5Gemma-TTS-2b-2b/NOTICE ADDED
@@ -0,0 +1 @@
 
 
1
+ Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
aratako_tts/T5Gemma-TTS-2b-2b/README.md ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license:
3
+ - gemma
4
+ - cc-by-nc-4.0
5
+ language:
6
+ - en
7
+ - zh
8
+ - ja
9
+ base_model:
10
+ - google/t5gemma-2b-2b-ul2
11
+ pipeline_tag: text-to-speech
12
+ library_name: transformers
13
+ tags:
14
+ - speech
15
+ - tts
16
+ datasets:
17
+ - amphion/Emilia-Dataset
18
+ - pkufool/libriheavy
19
+ extra_gated_heading: License & Ethics Agreement
20
+ extra_gated_description: >-
21
+ This model is for **Non-Commercial Use Only** (CC-BY-NC 4.0) and follows the **Gemma Terms of Use**.
22
+ Malicious use, including impersonation, is strictly prohibited.
23
+ extra_gated_button_content: Agree and Access
24
+ ---
25
+
26
+ # T5Gemma-TTS-2b-2b
27
+
28
+ [![GitHub](https://img.shields.io/badge/Code-GitHub-black)](https://github.com/Aratako/T5Gemma-TTS) [![WandB](https://img.shields.io/badge/Training%20Log-WandB-orange)](https://api.wandb.ai/links/aratako-lm/kfti30sc) [![Demo Space](https://img.shields.io/badge/Demo-HuggingFace%20Space-blue)](https://huggingface.co/spaces/Aratako/T5Gemma-TTS-Demo)
29
+
30
+ **[日本語版 README はこちら](https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/blob/main/README_ja.md)**
31
+
32
+ **T5Gemma-TTS-2b-2b** is a multilingual Text-to-Speech (TTS) model developed as a personal project. It utilizes an Encoder-Decoder LLM architecture, supporting English, Chinese, and Japanese.
33
+
34
+ ## 🌟 Overview
35
+
36
+ This model is an Encoder-Decoder LLM based TTS system initialized from the weights of [google/t5gemma-2b-2b-ul2](https://huggingface.co/google/t5gemma-2b-2b-ul2). While it leverages pre-trained LLM weights, the audio component has been trained from scratch specifically for TTS tasks.
37
+
38
+ You can try the interactive demo on Hugging Face Spaces: **[T5Gemma-TTS Demo](https://huggingface.co/spaces/Aratako/T5Gemma-TTS-Demo)**
39
+
40
+ ### Key Features
41
+
42
+ * **Multilingual Support:** Supports **English, Chinese, and Japanese**.
43
+ * **Voice Cloning:** Capable of zero-shot voice cloning from reference audio.
44
+ * **Duration Control:** Allows users to control the speed and length of the generated audio explicitly.
45
+ * **Open Source Code:** Training code and inference scripts are available on GitHub.
46
+
47
+ > **Note:** This is a hobby project. There are no formal objective evaluation metrics (WER/CER, SIM-O, etc.) available at this time.
48
+
49
+ ## 🏗️ Technical Details
50
+
51
+ ### Architecture
52
+
53
+ ![](https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/architecture.png)
54
+
55
+ The architecture is inspired by **VoiceStar** ([arXiv:2505.19462](https://arxiv.org/abs/2505.19462)). It adopts mechanisms such as **PM-RoPE** for length control.
56
+
57
+ * **Base Model:** [google/t5gemma-2b-2b-ul2](https://huggingface.co/google/t5gemma-2b-2b-ul2) (Weights used for initialization).
58
+ * **Audio Codec:** [XCodec2](https://huggingface.co/HKUSTAudio/xcodec2) and its derivatives.
59
+
60
+ ### Training Data
61
+
62
+ The model was trained on approximately **170,000 hours** of publicly available speech datasets (mainly [Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset) and [libriheavy](https://huggingface.co/datasets/pkufool/libriheavy)).
63
+
64
+ | Language | Approx. Hours |
65
+ | :--- | :--- |
66
+ | **English** | ~100k hours |
67
+ | **Chinese** | ~50k hours |
68
+ | **Japanese** | ~20k hours |
69
+
70
+ ### Training Hardware
71
+
72
+ Training was conducted on the **AMD Developer Cloud** using **8x MI300X** GPUs for approximately 2 weeks.
73
+
74
+ * You can check the training logs here: [WandB](https://api.wandb.ai/links/aratako-lm/kfti30sc)
75
+
76
+ ## 🎧 Audio Samples
77
+
78
+ Below are some samples generated by T5Gemma-TTS-2b-2b.
79
+
80
+ ### 1. Multilingual TTS
81
+
82
+ Basic text-to-speech generation in supported languages.
83
+
84
+ | Language | Text Prompt | Audio |
85
+ | :--- | :--- | :--- |
86
+ | **English** | "The old library was silent, save for the gentle ticking of a clock somewhere in the shadows. As I ran my fingers along the dusty spines of the books, I felt a strange sense of nostalgia, as if I had lived a thousand lives within these walls." | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/en_sample1.wav?download=true"></audio> |
87
+ | **Chinese** | "那是一个宁静的夜晚,月光洒在湖面上,波光粼粼。微风轻拂,带来了远处花朵的清香。我独自坐在岸边,心中涌起一股莫名的感动,仿佛整个世界都在这一刻静止了。" | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/zh_sample1.wav?download=true"></audio> |
88
+ | **Japanese** | "その森には、古い言い伝えがありました。月が最も高く昇る夜、静かに耳を澄ませば、風の歌声が聞こえるというのです。私は半信半疑でしたが、その夜、確かに誰かが私を呼ぶ声を聞いたのです。" | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/jp_sample1.wav?download=true"></audio> |
89
+
90
+ ### 2. Duration Control
91
+
92
+ Examples of generating the same text with different duration constraints.
93
+
94
+ **English Sample**
95
+ > Text: *"This new model allows users to strictly control the duration of the generated speech."*
96
+
97
+ | Target Duration | Generated Audio |
98
+ | :--- | :--- |
99
+ | **3.0s (Fast)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/en_sample2_fast.wav"></audio> |
100
+ | **5.0s (Normal)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/en_sample2_normal.wav"></audio> |
101
+ | **7.0s (Slow)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/en_sample2_slow.wav"></audio> |
102
+
103
+ **Japanese Sample**
104
+ > Text: *"このモデルでは、生成音声の長さを自由に調整できます。"*
105
+
106
+ | Target Duration | Generated Audio |
107
+ | :--- | :--- |
108
+ | **3.0s (Fast)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/jp_sample2_fast.wav"></audio> |
109
+ | **5.0s (Normal)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/jp_sample2_normal.wav"></audio> |
110
+ | **7.0s (Slow)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/jp_sample2_slow.wav"></audio> |
111
+
112
+ ### 3. Voice Cloning (Zero-shot)
113
+
114
+ Examples of cloning a voice from a reference audio clip.
115
+
116
+ > **Note:** The reference audio samples below were generated using **[NandemoGHS/Anime-Llasa-3B](https://huggingface.co/NandemoGHS/Anime-Llasa-3B)** and **[gemini-2.5-pro-preview-tts](https://cloud.google.com/text-to-speech/docs/gemini-tts)**.
117
+
118
+ | Case | Reference Audio | Generated Audio |
119
+ | :--- | :--- | :--- |
120
+ | **Example 1** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/ref_sample1.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/gen_sample1.wav"></audio> |
121
+ | **Example 2** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/ref_sample2.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/gen_sample2.wav"></audio> |
122
+ | **Example 3** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/ref_sample3.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b-resources/resolve/main/samples/gen_sample3.wav"></audio> |
123
+
124
+ ## 🚀 Usage
125
+
126
+ For inference code, installation instructions, and training scripts, please refer to the GitHub repository:
127
+
128
+ 👉 **[GitHub](https://github.com/Aratako/T5Gemma-TTS)**
129
+
130
+ ## ⚠️ Limitations
131
+
132
+ - **Inference Speed:** The model is not optimized for real-time TTS applications. Autoregressive generation of audio tokens takes significant time, making it unsuitable for low-latency use cases.
133
+ - **Duration Control:** While the model supports explicit duration specification, control is not perfect. Generated audio may differ from the specified duration, and even when the duration matches, the speech pacing or naturalness may not always be optimal.
134
+ - **Audio Quality:** Quality depends on training data characteristics. Performance may vary for voices, accents, or speaking styles underrepresented in the training data.
135
+
136
+ ## 📜 License
137
+
138
+ This model is released under a **Dual License** policy. Users must strictly comply with **BOTH** of the following sets of terms:
139
+
140
+ 1. **[Gemma Terms of Use](https://ai.google.dev/gemma/terms):** Since this model is derived from `google/t5gemma-2b-2b-ul2`, you must adhere to the Gemma Terms of Use.
141
+ 2. **[CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/):** Due to the constraints of the training datasets (such as Emilia), this model is restricted to **Non-Commercial Use Only**.
142
+
143
+ > **⚠️ Important Note on Codec:**
144
+ > The audio codec used, **XCodec2**, is also released under a **CC-BY-NC** license. Please ensure you also follow their license terms when using the generated audio.
145
+
146
+ **Ethical Restrictions:**
147
+ Do not use this model to impersonate specific individuals (e.g., voice cloning of voice actors, celebrities, or public figures) without their explicit consent.
148
+
149
+ ## 🙏 Acknowledgments
150
+
151
+ I would like to thank the following for their open-source contributions, which made this project possible:
152
+
153
+ - [VoiceStar](https://arxiv.org/abs/2505.19462) - Architecture inspiration
154
+ - [T5Gemma](https://huggingface.co/google/t5gemma-2b-2b-ul2) - Base model
155
+ - [XCodec2](https://huggingface.co/HKUSTAudio/xcodec2) and [XCodec2-Variant](https://huggingface.co/NandemoGHS/Anime-XCodec2-44.1kHz-v2) - Audio codec
156
+
157
+ ## 🖊️ Citation
158
+
159
+ If you cite this model, please cite it as follows:
160
+
161
+ ```bibtex
162
+ @misc{t5gemma-tts,
163
+ author = {Aratako},
164
+ title = {T5Gemma-TTS-2b-2b: An Encoder-Decoder LLM-based TTS Model},
165
+ year = {2025},
166
+ publisher = {Hugging Face},
167
+ journal = {Hugging Face repository},
168
+ howpublished = {\url{https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b}}
169
+ }
170
+ ```
aratako_tts/T5Gemma-TTS-2b-2b/README_ja.md ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license:
3
+ - gemma
4
+ - cc-by-nc-4.0
5
+ language:
6
+ - en
7
+ - zh
8
+ - ja
9
+ base_model:
10
+ - google/t5gemma-2b-2b-ul2
11
+ pipeline_tag: text-to-speech
12
+ library_name: transformers
13
+ tags:
14
+ - speech
15
+ - tts
16
+ datasets:
17
+ - amphion/Emilia-Dataset
18
+ - pkufool/libriheavy
19
+ ---
20
+
21
+ # T5Gemma-TTS-2b-2b
22
+
23
+ [![GitHub](https://img.shields.io/badge/Code-GitHub-black)](https://github.com/Aratako/T5Gemma-TTS) [![WandB](https://img.shields.io/badge/Training%20Log-WandB-orange)](https://api.wandb.ai/links/aratako-lm/kfti30sc) [![Demo Space](https://img.shields.io/badge/Demo-HuggingFace%20Space-blue)](https://huggingface.co/spaces/Aratako/T5Gemma-TTS-Demo)
24
+
25
+ **T5Gemma-TTS-2b-2b** は、個人プロジェクトとして開発された多言語対応の音声合成(TTS)モデルです。Encoder-Decoder型のLLMアーキテクチャを採用しており、英語、中国語、日本語に対応しています。
26
+
27
+ ## 🌟 概要
28
+
29
+ 本モデルは、[google/t5gemma-2b-2b-ul2](https://huggingface.co/google/t5gemma-2b-2b-ul2) の重みで初期化されたEncoder-DecoderのLLMベースのTTSシステムです。事前学習済みのLLMの重みを活用していますが、TTSタスク専用に音声部分はスクラッチから学習を行っています。
30
+
31
+ Hugging Face Spaces にてインタラクティブなデモをお試しいただけます: **[T5Gemma-TTS Demo](https://huggingface.co/spaces/Aratako/T5Gemma-TTS-Demo)**
32
+
33
+ ### 主な特徴
34
+
35
+ * **多言語対応:** **英語、中国語、日本語**をサポートしています。
36
+ * **音声クローン:** 参照音声からのzero-shot voice cloningが可能です。
37
+ * **発話時間の制御:** 生成される音声の長さをユーザーが明示的に制御可能です。
38
+ * **オープンソースなコード:** 学習コードおよび推論スクリプトをGitHubで公開しています。
39
+
40
+ > **Note:** 本モデルは個人開発プロジェクトとして開発されたものです。現時点では、客観的な評価指標(WER/CER、SIM-Oなど)のデータはありません。
41
+
42
+ ## 🏗️ 技術的詳細
43
+
44
+ ### アーキテクチャ
45
+
46
+ ![](https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/architecture.png)
47
+
48
+ アーキテクチャは **VoiceStar** ([arXiv:2505.19462](https://arxiv.org/abs/2505.19462)) に着想を得て設計されています。長さ制御のための **PM-RoPE** などの機構を採用しています。
49
+
50
+ * **ベースモデル:** [google/t5gemma-2b-2b-ul2](https://huggingface.co/google/t5gemma-2b-2b-ul2) (重みの初期化に使用)
51
+ * **音声コーデック:** [XCodec2](https://huggingface.co/HKUSTAudio/xcodec2) およびその派生版
52
+
53
+ ### 学習データ
54
+
55
+ 約 **170,000時間** の公開音声データセット(主に[Emilia](https://huggingface.co/datasets/amphion/Emilia-Dataset)、[libriheavy](https://huggingface.co/datasets/pkufool/libriheavy))を使用して学習を行いました。
56
+
57
+ | 言語 | およそのデータ量 |
58
+ | :--- | :--- |
59
+ | **英語** | ~100k 時間 |
60
+ | **中国語** | ~50k 時間 |
61
+ | **日本語** | ~20k 時間 |
62
+
63
+ ### 学習ハードウェア
64
+
65
+ 学習には **AMD Developer Cloud** を使用し、**8x MI300X** GPUを用いて約2週間トレーニングを行いました。
66
+
67
+ * 学習ログはこちらで確認できます: [WandB](https://api.wandb.ai/links/aratako-lm/kfti30sc)
68
+
69
+ ## 🎧 音声サンプル
70
+
71
+ T5Gemma-TTS-2b-2bによって生成されたサンプル音声です。
72
+
73
+ ### 1. 多言語TTS
74
+
75
+ 各言語での基本的なテキスト読み上げの生成例です。
76
+
77
+ | 言語 | テキストプロンプト | 音声 |
78
+ | :--- | :--- | :--- |
79
+ | **English** | "The old library was silent, save for the gentle ticking of a clock somewhere in the shadows. As I ran my fingers along the dusty spines of the books, I felt a strange sense of nostalgia, as if I had lived a thousand lives within these walls." | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/en_sample1.wav?download=true"></audio> |
80
+ | **Chinese** | "那是一个宁静的夜晚,月光洒在湖面上,波光粼粼。微风轻拂,带来了远处花朵的清香。我独自坐在岸边,心中涌起一股莫名的感动,仿佛整个世界都在这一刻静止了。" | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/zh_sample1.wav?download=true"></audio> |
81
+ | **Japanese** | "その森には、古い言い伝えがありました。月が最も高く昇る夜、静かに耳を澄ませば、風の歌声が聞こえるというのです。私は半信半疑でしたが、その夜、確かに誰かが私を呼ぶ声を聞いたのです。" | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/jp_sample1.wav?download=true"></audio> |
82
+
83
+ ### 2. 発話時間制御
84
+
85
+ 同じテキストを異なる長さの制約を与えて生成した例です。
86
+
87
+ **英語の例**
88
+ > Text: *"This new model allows users to strictly control the duration of the generated speech."*
89
+
90
+ | 目標時間 | 生成音声 |
91
+ | :--- | :--- |
92
+ | **3.0s (Fast)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/en_sample2_fast.wav"></audio> |
93
+ | **5.0s (Normal)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/en_sample2_normal.wav"></audio> |
94
+ | **7.0s (Slow)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/en_sample2_slow.wav"></audio> |
95
+
96
+ **日本語の例**
97
+ > Text: *"このモデルでは、生成音声の長さを自由に調整できます。"*
98
+
99
+ | 目標時間 | 生成音声 |
100
+ | :--- | :--- |
101
+ | **3.0s (Fast)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/jp_sample2_fast.wav"></audio> |
102
+ | **5.0s (Normal)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/jp_sample2_normal.wav"></audio> |
103
+ | **7.0s (Slow)** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/jp_sample2_slow.wav"></audio> |
104
+
105
+ ### 3. Voice Cloning (Zero-shot)
106
+
107
+ 参照音声を使用したVoice Cloningの例です。
108
+
109
+ > **Note:** 以下の参照音声は **[NandemoGHS/Anime-Llasa-3B](https://huggingface.co/NandemoGHS/Anime-Llasa-3B)** および **[gemini-2.5-pro-preview-tts](https://cloud.google.com/text-to-speech/docs/gemini-tts)** を使用して生成されたものです。
110
+
111
+ | ケース | 参照音声 (Reference) | 生成音声 (Generated) |
112
+ | :--- | :--- | :--- |
113
+ | **Example 1** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/ref_sample1.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/gen_sample1.wav"></audio> |
114
+ | **Example 2** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/ref_sample2.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/gen_sample2.wav"></audio> |
115
+ | **Example 3** | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/ref_sample3.wav"></audio> | <audio controls src="https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b/resolve/main/samples/gen_sample3.wav"></audio> |
116
+
117
+ ## 🚀 使用方法
118
+
119
+ 推論コード、インストール手順、および学習スクリプトについては、以下のGitHubリポジトリを参照してください。
120
+
121
+ 👉 **[GitHub](https://github.com/Aratako/T5Gemma-TTS)**
122
+
123
+ ## ⚠️ 制限事項
124
+
125
+ - **推論速度:** 本モデルはリアルタイムTTS向けに最適化されていません。音声トークンの自己回帰生成には相応の時間がかかるため、低遅延が求められる用途には適していません。
126
+ - **時間制御:** 生成時間の明示的な指定は可能ですが、制御は完全ではありません。指定した時間と実際の音声長が異なったり、時間が合っていても発話のペースや自然さが損なわれる場合があります。
127
+ - **音質:** 音質は学習データの特徴に依存します。学習データに含まれていない声質、アクセント、話し方などでは性能が低下する可能性があります。
128
+
129
+ ## 📜 ライセンス
130
+
131
+ 本モデルは **デュアルライセンス** ポリシーの下で公開されています。利用者は以下の**両方**の条件を厳守する必要があります。
132
+
133
+ 1. **[Gemma Terms of Use](https://ai.google.dev/gemma/terms):** 本モデルは `google/t5gemma-2b-2b-ul2` から派生しているため、Gemmaの利用規約に従う必要があります。
134
+ 2. **[CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/):** 学習データセット(Emiliaなど)の制約により、本モデルは **非商用利用のみ (Non-Commercial Use Only)** に制限されます。
135
+
136
+ > **⚠️ コーデックに関する重要事項:**
137
+ > 使用している音声コーデック **XCodec2** も **CC-BY-NC** ライセンスの下で公開されています。生成された音声を使用する際は、こちらのライセンス条件も遵守してください。
138
+
139
+ **倫理的制約:**
140
+ 本モデルを使用して、特定の個人(声優、著名人、公人など)の声を、本人の明示的な同意なくクローン(なりすまし)することを禁止します。
141
+
142
+ ## 🙏 謝辞
143
+
144
+ 本プロジェクトを実現するにあたり、以下のオープンソースプロジェクトおよびリソースに感謝いたします。
145
+
146
+ - [VoiceStar](https://arxiv.org/abs/2505.19462) - アーキテクチャの着想元
147
+ - [T5Gemma](https://huggingface.co/google/t5gemma-2b-2b-ul2) - ベースモデル
148
+ - [XCodec2](https://huggingface.co/HKUSTAudio/xcodec2) および [XCodec2-Variant](https://huggingface.co/NandemoGHS/Anime-XCodec2-44.1kHz-v2) - 音声コーデック
149
+
150
+ ## 🖊️ 引用 (Citation)
151
+
152
+ 本モデルを引用する場合は、以下のように引用してください。
153
+
154
+ ```bibtex
155
+ @misc{t5gemma-tts,
156
+ author = {Aratako},
157
+ title = {T5Gemma-TTS-2b-2b: An Encoder-Decoder LLM-based TTS Model},
158
+ year = {2025},
159
+ publisher = {Hugging Face},
160
+ journal = {Hugging Face repository},
161
+ howpublished = {\url{https://huggingface.co/Aratako/T5Gemma-TTS-2b-2b}}
162
+ }
aratako_tts/T5Gemma-TTS-2b-2b/architecture.png ADDED

Git LFS Details

  • SHA256: cc43fc049242400b0b0300ecf7dba26efa77491991a74c44eea08e8f0e436dfd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.67 MB
aratako_tts/T5Gemma-TTS-2b-2b/ckpt/pretrained.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b901434868b1b9d881880c3397802ed19a0718c3f71e22e7361486f072f20e9
3
+ size 10629148381
aratako_tts/T5Gemma-TTS-2b-2b/config.json ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_to_text": 0,
3
+ "add_eos_to_text": 1,
4
+ "architectures": [
5
+ "T5GemmaVoiceForConditionalGeneration"
6
+ ],
7
+ "attn_implementation": "sdpa",
8
+ "audio_mask_token": 1024,
9
+ "audio_max_length": 40.0,
10
+ "audio_pad_token": 65538,
11
+ "audio_tokenizer": "xcodec2",
12
+ "audio_vocab_size": 65536,
13
+ "auto_map": {
14
+ "AutoConfig": "configuration_t5gemma_voice.T5GemmaVoiceConfig",
15
+ "AutoModelForSeq2SeqLM": "modeling_t5gemma_voice.T5GemmaVoiceForConditionalGeneration"
16
+ },
17
+ "bos_token_id": 65536,
18
+ "codec_audio_sr": 44100,
19
+ "dtype": "bfloat16",
20
+ "empty_token": 65536,
21
+ "encodec_sr": 50.0,
22
+ "eog": 65537,
23
+ "eos": 65539,
24
+ "eos_token_id": 65539,
25
+ "extra_cutoff": 5,
26
+ "model_type": "t5gemma_voice",
27
+ "n_codebooks": 1,
28
+ "n_special": 5,
29
+ "pad_token_id": 65538,
30
+ "parallel_pattern": 0,
31
+ "precision": "bfloat16",
32
+ "progress_lookahead_secs": 2.0,
33
+ "progress_scale": 2000.0,
34
+ "prune_text_modules": 2,
35
+ "special_first": 0,
36
+ "t5_config_dict": {
37
+ "_name_or_path": "",
38
+ "add_cross_attention": false,
39
+ "architectures": [
40
+ "T5GemmaForConditionalGeneration"
41
+ ],
42
+ "attention_dropout": 0.0,
43
+ "bad_words_ids": null,
44
+ "begin_suppress_tokens": null,
45
+ "bos_token_id": 2,
46
+ "chunk_size_feed_forward": 0,
47
+ "classifier_dropout_rate": 0.0,
48
+ "cross_attention_hidden_size": null,
49
+ "decoder": {
50
+ "_name_or_path": "",
51
+ "add_cross_attention": false,
52
+ "architectures": null,
53
+ "attention_bias": false,
54
+ "attention_dropout": 0.0,
55
+ "attn_logit_softcapping": 50.0,
56
+ "bad_words_ids": null,
57
+ "begin_suppress_tokens": null,
58
+ "bos_token_id": 2,
59
+ "chunk_size_feed_forward": 0,
60
+ "classifier_dropout_rate": 0.0,
61
+ "cross_attention_hidden_size": 2304,
62
+ "decoder_start_token_id": null,
63
+ "diversity_penalty": 0.0,
64
+ "do_sample": false,
65
+ "dropout_rate": 0.0,
66
+ "dtype": "bfloat16",
67
+ "early_stopping": false,
68
+ "encoder_no_repeat_ngram_size": 0,
69
+ "eos_token_id": 1,
70
+ "exponential_decay_length_penalty": null,
71
+ "final_logit_softcapping": 30.0,
72
+ "finetuning_task": null,
73
+ "forced_bos_token_id": null,
74
+ "forced_eos_token_id": null,
75
+ "head_dim": 256,
76
+ "hidden_activation": "gelu_pytorch_tanh",
77
+ "hidden_size": 2304,
78
+ "id2label": {
79
+ "0": "LABEL_0",
80
+ "1": "LABEL_1"
81
+ },
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 9216,
84
+ "is_decoder": true,
85
+ "is_encoder_decoder": false,
86
+ "label2id": {
87
+ "LABEL_0": 0,
88
+ "LABEL_1": 1
89
+ },
90
+ "layer_types": [
91
+ "sliding_attention",
92
+ "full_attention",
93
+ "sliding_attention",
94
+ "full_attention",
95
+ "sliding_attention",
96
+ "full_attention",
97
+ "sliding_attention",
98
+ "full_attention",
99
+ "sliding_attention",
100
+ "full_attention",
101
+ "sliding_attention",
102
+ "full_attention",
103
+ "sliding_attention",
104
+ "full_attention",
105
+ "sliding_attention",
106
+ "full_attention",
107
+ "sliding_attention",
108
+ "full_attention",
109
+ "sliding_attention",
110
+ "full_attention",
111
+ "sliding_attention",
112
+ "full_attention",
113
+ "sliding_attention",
114
+ "full_attention",
115
+ "sliding_attention",
116
+ "full_attention"
117
+ ],
118
+ "length_penalty": 1.0,
119
+ "max_length": 20,
120
+ "max_position_embeddings": 8192,
121
+ "min_length": 0,
122
+ "model_type": "t5_gemma_module",
123
+ "no_repeat_ngram_size": 0,
124
+ "num_attention_heads": 8,
125
+ "num_beam_groups": 1,
126
+ "num_beams": 1,
127
+ "num_hidden_layers": 26,
128
+ "num_key_value_heads": 4,
129
+ "num_return_sequences": 1,
130
+ "output_attentions": false,
131
+ "output_hidden_states": false,
132
+ "output_scores": false,
133
+ "pad_token_id": 0,
134
+ "prefix": null,
135
+ "problem_type": null,
136
+ "pruned_heads": {},
137
+ "query_pre_attn_scalar": 256,
138
+ "remove_invalid_values": false,
139
+ "repetition_penalty": 1.0,
140
+ "return_dict": true,
141
+ "return_dict_in_generate": false,
142
+ "rms_norm_eps": 1e-06,
143
+ "rope_theta": 10000.0,
144
+ "sep_token_id": null,
145
+ "sliding_window": 4096,
146
+ "suppress_tokens": null,
147
+ "task_specific_params": null,
148
+ "temperature": 1.0,
149
+ "tf_legacy_loss": false,
150
+ "tie_encoder_decoder": false,
151
+ "tie_input_output_embeddings": false,
152
+ "tie_word_embeddings": false,
153
+ "tokenizer_class": null,
154
+ "top_k": 50,
155
+ "top_p": 1.0,
156
+ "torchscript": false,
157
+ "typical_p": 1.0,
158
+ "use_bfloat16": false,
159
+ "use_cache": true,
160
+ "vocab_size": 256000
161
+ },
162
+ "decoder_start_token_id": null,
163
+ "diversity_penalty": 0.0,
164
+ "do_sample": false,
165
+ "dropout_rate": 0.0,
166
+ "dtype": "bfloat16",
167
+ "early_stopping": false,
168
+ "encoder": {
169
+ "_name_or_path": "",
170
+ "add_cross_attention": false,
171
+ "architectures": null,
172
+ "attention_bias": false,
173
+ "attention_dropout": 0.0,
174
+ "attn_logit_softcapping": 50.0,
175
+ "bad_words_ids": null,
176
+ "begin_suppress_tokens": null,
177
+ "bos_token_id": 2,
178
+ "chunk_size_feed_forward": 0,
179
+ "classifier_dropout_rate": 0.0,
180
+ "cross_attention_hidden_size": null,
181
+ "decoder_start_token_id": null,
182
+ "diversity_penalty": 0.0,
183
+ "do_sample": false,
184
+ "dropout_rate": 0.0,
185
+ "dtype": "bfloat16",
186
+ "early_stopping": false,
187
+ "encoder_no_repeat_ngram_size": 0,
188
+ "eos_token_id": 1,
189
+ "exponential_decay_length_penalty": null,
190
+ "final_logit_softcapping": 30.0,
191
+ "finetuning_task": null,
192
+ "forced_bos_token_id": null,
193
+ "forced_eos_token_id": null,
194
+ "head_dim": 256,
195
+ "hidden_activation": "gelu_pytorch_tanh",
196
+ "hidden_size": 2304,
197
+ "id2label": {
198
+ "0": "LABEL_0",
199
+ "1": "LABEL_1"
200
+ },
201
+ "initializer_range": 0.02,
202
+ "intermediate_size": 9216,
203
+ "is_decoder": false,
204
+ "is_encoder_decoder": false,
205
+ "label2id": {
206
+ "LABEL_0": 0,
207
+ "LABEL_1": 1
208
+ },
209
+ "layer_types": [
210
+ "sliding_attention",
211
+ "full_attention",
212
+ "sliding_attention",
213
+ "full_attention",
214
+ "sliding_attention",
215
+ "full_attention",
216
+ "sliding_attention",
217
+ "full_attention",
218
+ "sliding_attention",
219
+ "full_attention",
220
+ "sliding_attention",
221
+ "full_attention",
222
+ "sliding_attention",
223
+ "full_attention",
224
+ "sliding_attention",
225
+ "full_attention",
226
+ "sliding_attention",
227
+ "full_attention",
228
+ "sliding_attention",
229
+ "full_attention",
230
+ "sliding_attention",
231
+ "full_attention",
232
+ "sliding_attention",
233
+ "full_attention",
234
+ "sliding_attention",
235
+ "full_attention"
236
+ ],
237
+ "length_penalty": 1.0,
238
+ "max_length": 20,
239
+ "max_position_embeddings": 8192,
240
+ "min_length": 0,
241
+ "model_type": "t5_gemma_module",
242
+ "no_repeat_ngram_size": 0,
243
+ "num_attention_heads": 8,
244
+ "num_beam_groups": 1,
245
+ "num_beams": 1,
246
+ "num_hidden_layers": 26,
247
+ "num_key_value_heads": 4,
248
+ "num_return_sequences": 1,
249
+ "output_attentions": false,
250
+ "output_hidden_states": false,
251
+ "output_scores": false,
252
+ "pad_token_id": 0,
253
+ "prefix": null,
254
+ "problem_type": null,
255
+ "pruned_heads": {},
256
+ "query_pre_attn_scalar": 256,
257
+ "remove_invalid_values": false,
258
+ "repetition_penalty": 1.0,
259
+ "return_dict": true,
260
+ "return_dict_in_generate": false,
261
+ "rms_norm_eps": 1e-06,
262
+ "rope_theta": 10000.0,
263
+ "sep_token_id": null,
264
+ "sliding_window": 4096,
265
+ "suppress_tokens": null,
266
+ "task_specific_params": null,
267
+ "temperature": 1.0,
268
+ "tf_legacy_loss": false,
269
+ "tie_encoder_decoder": false,
270
+ "tie_input_output_embeddings": false,
271
+ "tie_word_embeddings": false,
272
+ "tokenizer_class": null,
273
+ "top_k": 50,
274
+ "top_p": 1.0,
275
+ "torchscript": false,
276
+ "typical_p": 1.0,
277
+ "use_bfloat16": false,
278
+ "use_cache": true,
279
+ "vocab_size": 256000
280
+ },
281
+ "encoder_no_repeat_ngram_size": 0,
282
+ "eos_token_id": [
283
+ 1,
284
+ 107
285
+ ],
286
+ "exponential_decay_length_penalty": null,
287
+ "finetuning_task": null,
288
+ "forced_bos_token_id": null,
289
+ "forced_eos_token_id": null,
290
+ "id2label": {
291
+ "0": "LABEL_0",
292
+ "1": "LABEL_1"
293
+ },
294
+ "initializer_range": 0.02,
295
+ "is_decoder": false,
296
+ "is_encoder_decoder": true,
297
+ "label2id": {
298
+ "LABEL_0": 0,
299
+ "LABEL_1": 1
300
+ },
301
+ "length_penalty": 1.0,
302
+ "max_length": 20,
303
+ "min_length": 0,
304
+ "model_type": "t5gemma",
305
+ "no_repeat_ngram_size": 0,
306
+ "num_beam_groups": 1,
307
+ "num_beams": 1,
308
+ "num_return_sequences": 1,
309
+ "output_attentions": false,
310
+ "output_hidden_states": false,
311
+ "output_scores": false,
312
+ "pad_token_id": 0,
313
+ "prefix": null,
314
+ "problem_type": null,
315
+ "pruned_heads": {},
316
+ "remove_invalid_values": false,
317
+ "repetition_penalty": 1.0,
318
+ "return_dict": true,
319
+ "return_dict_in_generate": false,
320
+ "sep_token_id": null,
321
+ "suppress_tokens": null,
322
+ "task_specific_params": null,
323
+ "temperature": 1.0,
324
+ "tf_legacy_loss": false,
325
+ "tie_encoder_decoder": false,
326
+ "tie_input_output_embeddings": false,
327
+ "tie_word_embeddings": false,
328
+ "tokenizer_class": null,
329
+ "top_k": 50,
330
+ "top_p": 1.0,
331
+ "torchscript": false,
332
+ "transformers_version": "4.57.3",
333
+ "typical_p": 1.0,
334
+ "use_bfloat16": false,
335
+ "use_cache": true,
336
+ "vocab_size": 256000
337
+ },
338
+ "t5gemma_model_name": "google/t5gemma-2b-2b-ul2",
339
+ "text_guard_frames_per_token": 0,
340
+ "text_input_type": "text",
341
+ "text_tokenizer_name": "google/t5gemma-2b-2b-ul2",
342
+ "tie_input_output_embeddings": false,
343
+ "tie_word_embeddings": false,
344
+ "transformers_version": "4.57.3",
345
+ "use_pm_rope": 1,
346
+ "x_sep_token": 255999,
347
+ "xcodec2_model_name": "NandemoGHS/Anime-XCodec2-44.1kHz-v2",
348
+ "y_sep_token": 65540
349
+ }
aratako_tts/T5Gemma-TTS-2b-2b/configuration_t5gemma_voice.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for inference-only T5GemmaVoice model.
3
+
4
+ Kept intentionally minimal: only fields that affect inference-time shapes
5
+ or sampling behaviour are retained.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Dict, List, Optional, Union
11
+
12
+ from transformers.configuration_utils import PretrainedConfig
13
+
14
+
15
+ class T5GemmaVoiceConfig(PretrainedConfig):
16
+ model_type = "t5gemma_voice"
17
+ is_encoder_decoder = True
18
+
19
+ def __init__(
20
+ self,
21
+ # backbone
22
+ t5gemma_model_name: str = "google/t5gemma-2b-2b-ul2",
23
+ t5_config_dict: Optional[Dict[str, Any]] = None,
24
+ attn_implementation: str = "eager",
25
+ precision: str = "float32",
26
+ prune_text_modules: int = 0,
27
+ use_pm_rope: int = 1,
28
+ tie_word_embeddings: Optional[bool] = None,
29
+ tie_input_output_embeddings: Optional[bool] = None,
30
+ n_codebooks: int = 1,
31
+ audio_vocab_size: Union[int, List[int]] = 65536,
32
+ n_special: int = 5,
33
+ empty_token: int = 65536,
34
+ eog: int = 65537,
35
+ eos: int = 65539,
36
+ audio_pad_token: int = 65538,
37
+ audio_mask_token: int = 1024,
38
+ y_sep_token: int = 65540,
39
+ x_sep_token: int = 255999,
40
+ special_first: int = 0,
41
+ encodec_sr: float = 50.0,
42
+ progress_scale: float = 2000.0,
43
+ progress_lookahead_secs: float = 2.0,
44
+ extra_cutoff: float = 5.0,
45
+ text_guard_frames_per_token: int = 0,
46
+ add_eos_to_text: int = 0,
47
+ add_bos_to_text: int = 0,
48
+ parallel_pattern: int = 0,
49
+ audio_max_length: float = 40.0,
50
+ audio_tokenizer: str = "xcodec2",
51
+ xcodec2_model_name: Optional[str] = None,
52
+ codec_audio_sr: Optional[float] = None,
53
+ text_tokenizer_name: Optional[str] = None,
54
+ # misc
55
+ **kwargs,
56
+ ) -> None:
57
+ kwargs = dict(kwargs)
58
+ # avoid duplicate values when loading from config.json that already stores these ids
59
+ for _key in ("bos_token_id", "eos_token_id", "pad_token_id"):
60
+ kwargs.pop(_key, None)
61
+
62
+ super().__init__(
63
+ bos_token_id=empty_token,
64
+ eos_token_id=eos,
65
+ pad_token_id=audio_pad_token,
66
+ **kwargs,
67
+ )
68
+
69
+ # store backbone config for offline instantiation
70
+ self.t5_config_dict = t5_config_dict
71
+ self.t5gemma_model_name = t5gemma_model_name
72
+ self.attn_implementation = attn_implementation
73
+ self.precision = precision
74
+ self.prune_text_modules = prune_text_modules
75
+ self.use_pm_rope = use_pm_rope
76
+ self.tie_word_embeddings = tie_word_embeddings
77
+ self.tie_input_output_embeddings = tie_input_output_embeddings
78
+
79
+ self.text_input_type = "text"
80
+ self.n_codebooks = n_codebooks
81
+ self.audio_vocab_size = audio_vocab_size
82
+ self.n_special = n_special
83
+ self.empty_token = empty_token
84
+ self.eog = eog
85
+ self.eos = eos
86
+ self.audio_pad_token = audio_pad_token
87
+ self.audio_mask_token = audio_mask_token
88
+ self.y_sep_token = y_sep_token
89
+ self.x_sep_token = x_sep_token
90
+ self.special_first = special_first
91
+ self.encodec_sr = encodec_sr
92
+ self.progress_scale = progress_scale
93
+ self.progress_lookahead_secs = progress_lookahead_secs
94
+ self.extra_cutoff = extra_cutoff
95
+ self.text_guard_frames_per_token = text_guard_frames_per_token
96
+ self.add_eos_to_text = add_eos_to_text
97
+ self.add_bos_to_text = add_bos_to_text
98
+ self.parallel_pattern = parallel_pattern
99
+ self.audio_max_length = audio_max_length
100
+ self.audio_tokenizer = audio_tokenizer
101
+ self.xcodec2_model_name = xcodec2_model_name
102
+ self.codec_audio_sr = codec_audio_sr
103
+ self.text_tokenizer_name = text_tokenizer_name
104
+
105
+ # tell Auto* which files to load when trust_remote_code=True
106
+ self.auto_map = {
107
+ "AutoConfig": "configuration_t5gemma_voice.T5GemmaVoiceConfig",
108
+ "AutoModelForSeq2SeqLM": "modeling_t5gemma_voice.T5GemmaVoiceForConditionalGeneration",
109
+ }
110
+
111
+ @property
112
+ def audio_vocab_sizes(self) -> List[int]:
113
+ """Utility to normalize audio_vocab_size to list form."""
114
+ if isinstance(self.audio_vocab_size, list):
115
+ return list(self.audio_vocab_size)
116
+ return [int(self.audio_vocab_size)] * int(self.n_codebooks)
117
+
118
+
119
+ __all__ = ["T5GemmaVoiceConfig"]
aratako_tts/T5Gemma-TTS-2b-2b/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 65536,
4
+ "eos_token_id": 65539,
5
+ "pad_token_id": 65538,
6
+ "transformers_version": "4.57.3"
7
+ }
aratako_tts/T5Gemma-TTS-2b-2b/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12a5d05741e9c00c966521b5692810785f882a5141d4ae440deefcdd2886cab5
3
+ size 4988044752
aratako_tts/T5Gemma-TTS-2b-2b/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f548b2f61fb3f0771913c1e6f03dc387ec9a6dc4b7a63050b10e1e3f72ae0c29
3
+ size 4997791840
aratako_tts/T5Gemma-TTS-2b-2b/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c60fba107fb9ee6051c4fbe69ad65f42312d1765d559c614ba912f471ee816
3
+ size 643100482
aratako_tts/T5Gemma-TTS-2b-2b/model.safetensors.index.json ADDED
@@ -0,0 +1,744 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 5314418949,
4
+ "total_size": 10628837898
5
+ },
6
+ "weight_map": {
7
+ "audio_embedding.0.weight": "model-00003-of-00003.safetensors",
8
+ "backbone.model.decoder.layers.0.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
9
+ "backbone.model.decoder.layers.0.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
10
+ "backbone.model.decoder.layers.0.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
11
+ "backbone.model.decoder.layers.0.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
12
+ "backbone.model.decoder.layers.0.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
13
+ "backbone.model.decoder.layers.0.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
14
+ "backbone.model.decoder.layers.0.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
15
+ "backbone.model.decoder.layers.0.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
16
+ "backbone.model.decoder.layers.0.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
17
+ "backbone.model.decoder.layers.0.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
18
+ "backbone.model.decoder.layers.0.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
19
+ "backbone.model.decoder.layers.0.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
20
+ "backbone.model.decoder.layers.0.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
21
+ "backbone.model.decoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
22
+ "backbone.model.decoder.layers.0.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
23
+ "backbone.model.decoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
24
+ "backbone.model.decoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
25
+ "backbone.model.decoder.layers.1.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
26
+ "backbone.model.decoder.layers.1.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
27
+ "backbone.model.decoder.layers.1.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
28
+ "backbone.model.decoder.layers.1.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
29
+ "backbone.model.decoder.layers.1.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
30
+ "backbone.model.decoder.layers.1.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
31
+ "backbone.model.decoder.layers.1.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
32
+ "backbone.model.decoder.layers.1.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
33
+ "backbone.model.decoder.layers.1.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
34
+ "backbone.model.decoder.layers.1.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
35
+ "backbone.model.decoder.layers.1.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "backbone.model.decoder.layers.1.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
37
+ "backbone.model.decoder.layers.1.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
38
+ "backbone.model.decoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
39
+ "backbone.model.decoder.layers.1.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
40
+ "backbone.model.decoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
41
+ "backbone.model.decoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
42
+ "backbone.model.decoder.layers.10.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
43
+ "backbone.model.decoder.layers.10.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
44
+ "backbone.model.decoder.layers.10.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
45
+ "backbone.model.decoder.layers.10.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
46
+ "backbone.model.decoder.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
47
+ "backbone.model.decoder.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
48
+ "backbone.model.decoder.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
49
+ "backbone.model.decoder.layers.10.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
50
+ "backbone.model.decoder.layers.10.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
51
+ "backbone.model.decoder.layers.10.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
52
+ "backbone.model.decoder.layers.10.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
53
+ "backbone.model.decoder.layers.10.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "backbone.model.decoder.layers.10.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
55
+ "backbone.model.decoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
56
+ "backbone.model.decoder.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
57
+ "backbone.model.decoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
58
+ "backbone.model.decoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
59
+ "backbone.model.decoder.layers.11.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
60
+ "backbone.model.decoder.layers.11.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
61
+ "backbone.model.decoder.layers.11.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
62
+ "backbone.model.decoder.layers.11.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
63
+ "backbone.model.decoder.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "backbone.model.decoder.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "backbone.model.decoder.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "backbone.model.decoder.layers.11.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "backbone.model.decoder.layers.11.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
68
+ "backbone.model.decoder.layers.11.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
69
+ "backbone.model.decoder.layers.11.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
70
+ "backbone.model.decoder.layers.11.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
71
+ "backbone.model.decoder.layers.11.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "backbone.model.decoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
73
+ "backbone.model.decoder.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
74
+ "backbone.model.decoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
75
+ "backbone.model.decoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
76
+ "backbone.model.decoder.layers.12.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "backbone.model.decoder.layers.12.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "backbone.model.decoder.layers.12.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "backbone.model.decoder.layers.12.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "backbone.model.decoder.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
81
+ "backbone.model.decoder.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
82
+ "backbone.model.decoder.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
83
+ "backbone.model.decoder.layers.12.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
84
+ "backbone.model.decoder.layers.12.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "backbone.model.decoder.layers.12.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
86
+ "backbone.model.decoder.layers.12.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
87
+ "backbone.model.decoder.layers.12.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
88
+ "backbone.model.decoder.layers.12.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
89
+ "backbone.model.decoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
90
+ "backbone.model.decoder.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
91
+ "backbone.model.decoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
92
+ "backbone.model.decoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
93
+ "backbone.model.decoder.layers.13.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
94
+ "backbone.model.decoder.layers.13.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
95
+ "backbone.model.decoder.layers.13.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
96
+ "backbone.model.decoder.layers.13.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
97
+ "backbone.model.decoder.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
98
+ "backbone.model.decoder.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
99
+ "backbone.model.decoder.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
100
+ "backbone.model.decoder.layers.13.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
101
+ "backbone.model.decoder.layers.13.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
102
+ "backbone.model.decoder.layers.13.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "backbone.model.decoder.layers.13.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
104
+ "backbone.model.decoder.layers.13.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
105
+ "backbone.model.decoder.layers.13.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
106
+ "backbone.model.decoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
107
+ "backbone.model.decoder.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
108
+ "backbone.model.decoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
109
+ "backbone.model.decoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
110
+ "backbone.model.decoder.layers.14.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
111
+ "backbone.model.decoder.layers.14.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
112
+ "backbone.model.decoder.layers.14.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
113
+ "backbone.model.decoder.layers.14.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
114
+ "backbone.model.decoder.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
115
+ "backbone.model.decoder.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
116
+ "backbone.model.decoder.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
117
+ "backbone.model.decoder.layers.14.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
118
+ "backbone.model.decoder.layers.14.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
119
+ "backbone.model.decoder.layers.14.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
120
+ "backbone.model.decoder.layers.14.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
121
+ "backbone.model.decoder.layers.14.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
122
+ "backbone.model.decoder.layers.14.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
123
+ "backbone.model.decoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
124
+ "backbone.model.decoder.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
125
+ "backbone.model.decoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
126
+ "backbone.model.decoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
127
+ "backbone.model.decoder.layers.15.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
128
+ "backbone.model.decoder.layers.15.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
129
+ "backbone.model.decoder.layers.15.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
130
+ "backbone.model.decoder.layers.15.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
131
+ "backbone.model.decoder.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
132
+ "backbone.model.decoder.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
133
+ "backbone.model.decoder.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
134
+ "backbone.model.decoder.layers.15.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "backbone.model.decoder.layers.15.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
136
+ "backbone.model.decoder.layers.15.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
137
+ "backbone.model.decoder.layers.15.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
138
+ "backbone.model.decoder.layers.15.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "backbone.model.decoder.layers.15.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
140
+ "backbone.model.decoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
141
+ "backbone.model.decoder.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
142
+ "backbone.model.decoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
143
+ "backbone.model.decoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
144
+ "backbone.model.decoder.layers.16.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
145
+ "backbone.model.decoder.layers.16.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
146
+ "backbone.model.decoder.layers.16.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
147
+ "backbone.model.decoder.layers.16.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
148
+ "backbone.model.decoder.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
149
+ "backbone.model.decoder.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
150
+ "backbone.model.decoder.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
151
+ "backbone.model.decoder.layers.16.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
152
+ "backbone.model.decoder.layers.16.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
153
+ "backbone.model.decoder.layers.16.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
154
+ "backbone.model.decoder.layers.16.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
155
+ "backbone.model.decoder.layers.16.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
156
+ "backbone.model.decoder.layers.16.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
157
+ "backbone.model.decoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
158
+ "backbone.model.decoder.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
159
+ "backbone.model.decoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
+ "backbone.model.decoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
+ "backbone.model.decoder.layers.17.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
162
+ "backbone.model.decoder.layers.17.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
163
+ "backbone.model.decoder.layers.17.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
164
+ "backbone.model.decoder.layers.17.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
165
+ "backbone.model.decoder.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
166
+ "backbone.model.decoder.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
167
+ "backbone.model.decoder.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
168
+ "backbone.model.decoder.layers.17.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
169
+ "backbone.model.decoder.layers.17.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
170
+ "backbone.model.decoder.layers.17.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
171
+ "backbone.model.decoder.layers.17.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
172
+ "backbone.model.decoder.layers.17.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
173
+ "backbone.model.decoder.layers.17.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
174
+ "backbone.model.decoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
175
+ "backbone.model.decoder.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
176
+ "backbone.model.decoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
177
+ "backbone.model.decoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
178
+ "backbone.model.decoder.layers.18.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
179
+ "backbone.model.decoder.layers.18.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
180
+ "backbone.model.decoder.layers.18.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
181
+ "backbone.model.decoder.layers.18.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
182
+ "backbone.model.decoder.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
183
+ "backbone.model.decoder.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
184
+ "backbone.model.decoder.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
185
+ "backbone.model.decoder.layers.18.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
186
+ "backbone.model.decoder.layers.18.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
187
+ "backbone.model.decoder.layers.18.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
188
+ "backbone.model.decoder.layers.18.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "backbone.model.decoder.layers.18.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
190
+ "backbone.model.decoder.layers.18.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
191
+ "backbone.model.decoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
192
+ "backbone.model.decoder.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
193
+ "backbone.model.decoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
194
+ "backbone.model.decoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
195
+ "backbone.model.decoder.layers.19.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
196
+ "backbone.model.decoder.layers.19.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
197
+ "backbone.model.decoder.layers.19.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
198
+ "backbone.model.decoder.layers.19.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
199
+ "backbone.model.decoder.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
200
+ "backbone.model.decoder.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
201
+ "backbone.model.decoder.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
202
+ "backbone.model.decoder.layers.19.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
203
+ "backbone.model.decoder.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
204
+ "backbone.model.decoder.layers.19.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
205
+ "backbone.model.decoder.layers.19.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
206
+ "backbone.model.decoder.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
207
+ "backbone.model.decoder.layers.19.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
208
+ "backbone.model.decoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
209
+ "backbone.model.decoder.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
210
+ "backbone.model.decoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
211
+ "backbone.model.decoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
212
+ "backbone.model.decoder.layers.2.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
213
+ "backbone.model.decoder.layers.2.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
214
+ "backbone.model.decoder.layers.2.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
215
+ "backbone.model.decoder.layers.2.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
216
+ "backbone.model.decoder.layers.2.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
217
+ "backbone.model.decoder.layers.2.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
218
+ "backbone.model.decoder.layers.2.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
219
+ "backbone.model.decoder.layers.2.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
220
+ "backbone.model.decoder.layers.2.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
221
+ "backbone.model.decoder.layers.2.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
222
+ "backbone.model.decoder.layers.2.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
223
+ "backbone.model.decoder.layers.2.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
224
+ "backbone.model.decoder.layers.2.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
225
+ "backbone.model.decoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
226
+ "backbone.model.decoder.layers.2.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
227
+ "backbone.model.decoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
228
+ "backbone.model.decoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
229
+ "backbone.model.decoder.layers.20.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
230
+ "backbone.model.decoder.layers.20.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
231
+ "backbone.model.decoder.layers.20.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
232
+ "backbone.model.decoder.layers.20.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
233
+ "backbone.model.decoder.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
234
+ "backbone.model.decoder.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
235
+ "backbone.model.decoder.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
236
+ "backbone.model.decoder.layers.20.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "backbone.model.decoder.layers.20.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
238
+ "backbone.model.decoder.layers.20.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
239
+ "backbone.model.decoder.layers.20.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
240
+ "backbone.model.decoder.layers.20.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
241
+ "backbone.model.decoder.layers.20.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
242
+ "backbone.model.decoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
243
+ "backbone.model.decoder.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
244
+ "backbone.model.decoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
245
+ "backbone.model.decoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
246
+ "backbone.model.decoder.layers.21.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
247
+ "backbone.model.decoder.layers.21.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
248
+ "backbone.model.decoder.layers.21.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
249
+ "backbone.model.decoder.layers.21.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
250
+ "backbone.model.decoder.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
251
+ "backbone.model.decoder.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
252
+ "backbone.model.decoder.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
253
+ "backbone.model.decoder.layers.21.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
254
+ "backbone.model.decoder.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
255
+ "backbone.model.decoder.layers.21.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
256
+ "backbone.model.decoder.layers.21.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
257
+ "backbone.model.decoder.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
258
+ "backbone.model.decoder.layers.21.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
259
+ "backbone.model.decoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
260
+ "backbone.model.decoder.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
261
+ "backbone.model.decoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
262
+ "backbone.model.decoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
263
+ "backbone.model.decoder.layers.22.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
264
+ "backbone.model.decoder.layers.22.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
265
+ "backbone.model.decoder.layers.22.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
266
+ "backbone.model.decoder.layers.22.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
267
+ "backbone.model.decoder.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
268
+ "backbone.model.decoder.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
269
+ "backbone.model.decoder.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
270
+ "backbone.model.decoder.layers.22.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
271
+ "backbone.model.decoder.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
272
+ "backbone.model.decoder.layers.22.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
273
+ "backbone.model.decoder.layers.22.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
274
+ "backbone.model.decoder.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
275
+ "backbone.model.decoder.layers.22.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
276
+ "backbone.model.decoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
277
+ "backbone.model.decoder.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
278
+ "backbone.model.decoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
279
+ "backbone.model.decoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
280
+ "backbone.model.decoder.layers.23.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
281
+ "backbone.model.decoder.layers.23.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
282
+ "backbone.model.decoder.layers.23.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
283
+ "backbone.model.decoder.layers.23.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
284
+ "backbone.model.decoder.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
285
+ "backbone.model.decoder.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
286
+ "backbone.model.decoder.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
287
+ "backbone.model.decoder.layers.23.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
288
+ "backbone.model.decoder.layers.23.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
289
+ "backbone.model.decoder.layers.23.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
290
+ "backbone.model.decoder.layers.23.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
291
+ "backbone.model.decoder.layers.23.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
292
+ "backbone.model.decoder.layers.23.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
293
+ "backbone.model.decoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
294
+ "backbone.model.decoder.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
295
+ "backbone.model.decoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
296
+ "backbone.model.decoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
297
+ "backbone.model.decoder.layers.24.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
298
+ "backbone.model.decoder.layers.24.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
299
+ "backbone.model.decoder.layers.24.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
300
+ "backbone.model.decoder.layers.24.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
301
+ "backbone.model.decoder.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
302
+ "backbone.model.decoder.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
303
+ "backbone.model.decoder.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
304
+ "backbone.model.decoder.layers.24.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
305
+ "backbone.model.decoder.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
306
+ "backbone.model.decoder.layers.24.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
307
+ "backbone.model.decoder.layers.24.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
308
+ "backbone.model.decoder.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "backbone.model.decoder.layers.24.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
310
+ "backbone.model.decoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
311
+ "backbone.model.decoder.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
312
+ "backbone.model.decoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
313
+ "backbone.model.decoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
314
+ "backbone.model.decoder.layers.25.cross_attn.k_proj.weight": "model-00003-of-00003.safetensors",
315
+ "backbone.model.decoder.layers.25.cross_attn.o_proj.weight": "model-00003-of-00003.safetensors",
316
+ "backbone.model.decoder.layers.25.cross_attn.q_proj.weight": "model-00003-of-00003.safetensors",
317
+ "backbone.model.decoder.layers.25.cross_attn.v_proj.weight": "model-00003-of-00003.safetensors",
318
+ "backbone.model.decoder.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
319
+ "backbone.model.decoder.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
320
+ "backbone.model.decoder.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
321
+ "backbone.model.decoder.layers.25.post_cross_attn_layernorm.weight": "model-00003-of-00003.safetensors",
322
+ "backbone.model.decoder.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
323
+ "backbone.model.decoder.layers.25.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
324
+ "backbone.model.decoder.layers.25.pre_cross_attn_layernorm.weight": "model-00003-of-00003.safetensors",
325
+ "backbone.model.decoder.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
326
+ "backbone.model.decoder.layers.25.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
327
+ "backbone.model.decoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
328
+ "backbone.model.decoder.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
329
+ "backbone.model.decoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
330
+ "backbone.model.decoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
331
+ "backbone.model.decoder.layers.3.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
332
+ "backbone.model.decoder.layers.3.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
333
+ "backbone.model.decoder.layers.3.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
334
+ "backbone.model.decoder.layers.3.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
335
+ "backbone.model.decoder.layers.3.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
336
+ "backbone.model.decoder.layers.3.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
337
+ "backbone.model.decoder.layers.3.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
338
+ "backbone.model.decoder.layers.3.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
339
+ "backbone.model.decoder.layers.3.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
340
+ "backbone.model.decoder.layers.3.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
341
+ "backbone.model.decoder.layers.3.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
342
+ "backbone.model.decoder.layers.3.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
343
+ "backbone.model.decoder.layers.3.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
344
+ "backbone.model.decoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
345
+ "backbone.model.decoder.layers.3.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
346
+ "backbone.model.decoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
347
+ "backbone.model.decoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
348
+ "backbone.model.decoder.layers.4.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
349
+ "backbone.model.decoder.layers.4.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
350
+ "backbone.model.decoder.layers.4.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
351
+ "backbone.model.decoder.layers.4.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
352
+ "backbone.model.decoder.layers.4.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
353
+ "backbone.model.decoder.layers.4.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
354
+ "backbone.model.decoder.layers.4.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
355
+ "backbone.model.decoder.layers.4.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
356
+ "backbone.model.decoder.layers.4.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
357
+ "backbone.model.decoder.layers.4.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
358
+ "backbone.model.decoder.layers.4.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
359
+ "backbone.model.decoder.layers.4.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
360
+ "backbone.model.decoder.layers.4.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
361
+ "backbone.model.decoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
362
+ "backbone.model.decoder.layers.4.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
363
+ "backbone.model.decoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
364
+ "backbone.model.decoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
365
+ "backbone.model.decoder.layers.5.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
366
+ "backbone.model.decoder.layers.5.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
367
+ "backbone.model.decoder.layers.5.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
368
+ "backbone.model.decoder.layers.5.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
369
+ "backbone.model.decoder.layers.5.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
370
+ "backbone.model.decoder.layers.5.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
371
+ "backbone.model.decoder.layers.5.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
372
+ "backbone.model.decoder.layers.5.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
373
+ "backbone.model.decoder.layers.5.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
374
+ "backbone.model.decoder.layers.5.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
375
+ "backbone.model.decoder.layers.5.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
376
+ "backbone.model.decoder.layers.5.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
377
+ "backbone.model.decoder.layers.5.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
378
+ "backbone.model.decoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
379
+ "backbone.model.decoder.layers.5.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
380
+ "backbone.model.decoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
381
+ "backbone.model.decoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
382
+ "backbone.model.decoder.layers.6.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
383
+ "backbone.model.decoder.layers.6.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
384
+ "backbone.model.decoder.layers.6.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
385
+ "backbone.model.decoder.layers.6.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
386
+ "backbone.model.decoder.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
387
+ "backbone.model.decoder.layers.6.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
388
+ "backbone.model.decoder.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
389
+ "backbone.model.decoder.layers.6.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
390
+ "backbone.model.decoder.layers.6.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
391
+ "backbone.model.decoder.layers.6.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
392
+ "backbone.model.decoder.layers.6.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
393
+ "backbone.model.decoder.layers.6.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
394
+ "backbone.model.decoder.layers.6.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
395
+ "backbone.model.decoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
396
+ "backbone.model.decoder.layers.6.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
397
+ "backbone.model.decoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
398
+ "backbone.model.decoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
399
+ "backbone.model.decoder.layers.7.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
400
+ "backbone.model.decoder.layers.7.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
401
+ "backbone.model.decoder.layers.7.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
402
+ "backbone.model.decoder.layers.7.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
403
+ "backbone.model.decoder.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
404
+ "backbone.model.decoder.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
405
+ "backbone.model.decoder.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
406
+ "backbone.model.decoder.layers.7.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
407
+ "backbone.model.decoder.layers.7.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
408
+ "backbone.model.decoder.layers.7.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
409
+ "backbone.model.decoder.layers.7.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
410
+ "backbone.model.decoder.layers.7.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
411
+ "backbone.model.decoder.layers.7.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
412
+ "backbone.model.decoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
413
+ "backbone.model.decoder.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
414
+ "backbone.model.decoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
415
+ "backbone.model.decoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
416
+ "backbone.model.decoder.layers.8.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
417
+ "backbone.model.decoder.layers.8.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
418
+ "backbone.model.decoder.layers.8.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
419
+ "backbone.model.decoder.layers.8.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
420
+ "backbone.model.decoder.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
421
+ "backbone.model.decoder.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
422
+ "backbone.model.decoder.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
423
+ "backbone.model.decoder.layers.8.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
424
+ "backbone.model.decoder.layers.8.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
425
+ "backbone.model.decoder.layers.8.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
426
+ "backbone.model.decoder.layers.8.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
427
+ "backbone.model.decoder.layers.8.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
428
+ "backbone.model.decoder.layers.8.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
429
+ "backbone.model.decoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
430
+ "backbone.model.decoder.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
431
+ "backbone.model.decoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
432
+ "backbone.model.decoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
433
+ "backbone.model.decoder.layers.9.cross_attn.k_proj.weight": "model-00002-of-00003.safetensors",
434
+ "backbone.model.decoder.layers.9.cross_attn.o_proj.weight": "model-00002-of-00003.safetensors",
435
+ "backbone.model.decoder.layers.9.cross_attn.q_proj.weight": "model-00002-of-00003.safetensors",
436
+ "backbone.model.decoder.layers.9.cross_attn.v_proj.weight": "model-00002-of-00003.safetensors",
437
+ "backbone.model.decoder.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
438
+ "backbone.model.decoder.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
439
+ "backbone.model.decoder.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
440
+ "backbone.model.decoder.layers.9.post_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
441
+ "backbone.model.decoder.layers.9.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
442
+ "backbone.model.decoder.layers.9.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
443
+ "backbone.model.decoder.layers.9.pre_cross_attn_layernorm.weight": "model-00002-of-00003.safetensors",
444
+ "backbone.model.decoder.layers.9.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
445
+ "backbone.model.decoder.layers.9.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
446
+ "backbone.model.decoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
447
+ "backbone.model.decoder.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
448
+ "backbone.model.decoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
449
+ "backbone.model.decoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
450
+ "backbone.model.decoder.norm.weight": "model-00002-of-00003.safetensors",
451
+ "backbone.model.encoder.embed_tokens.weight": "model-00001-of-00003.safetensors",
452
+ "backbone.model.encoder.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
453
+ "backbone.model.encoder.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
454
+ "backbone.model.encoder.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
455
+ "backbone.model.encoder.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
456
+ "backbone.model.encoder.layers.0.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
457
+ "backbone.model.encoder.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
458
+ "backbone.model.encoder.layers.0.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
459
+ "backbone.model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
460
+ "backbone.model.encoder.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
461
+ "backbone.model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
462
+ "backbone.model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
463
+ "backbone.model.encoder.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
464
+ "backbone.model.encoder.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
465
+ "backbone.model.encoder.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
466
+ "backbone.model.encoder.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
467
+ "backbone.model.encoder.layers.1.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
468
+ "backbone.model.encoder.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
469
+ "backbone.model.encoder.layers.1.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
470
+ "backbone.model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
471
+ "backbone.model.encoder.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
472
+ "backbone.model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
473
+ "backbone.model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
474
+ "backbone.model.encoder.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
475
+ "backbone.model.encoder.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
476
+ "backbone.model.encoder.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
477
+ "backbone.model.encoder.layers.10.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
478
+ "backbone.model.encoder.layers.10.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
479
+ "backbone.model.encoder.layers.10.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
480
+ "backbone.model.encoder.layers.10.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
481
+ "backbone.model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
482
+ "backbone.model.encoder.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
483
+ "backbone.model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
484
+ "backbone.model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
485
+ "backbone.model.encoder.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
486
+ "backbone.model.encoder.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
487
+ "backbone.model.encoder.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
488
+ "backbone.model.encoder.layers.11.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
489
+ "backbone.model.encoder.layers.11.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
490
+ "backbone.model.encoder.layers.11.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
491
+ "backbone.model.encoder.layers.11.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
492
+ "backbone.model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
493
+ "backbone.model.encoder.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
494
+ "backbone.model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
495
+ "backbone.model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
496
+ "backbone.model.encoder.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
497
+ "backbone.model.encoder.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
498
+ "backbone.model.encoder.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
499
+ "backbone.model.encoder.layers.12.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
500
+ "backbone.model.encoder.layers.12.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
501
+ "backbone.model.encoder.layers.12.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
502
+ "backbone.model.encoder.layers.12.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
503
+ "backbone.model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
504
+ "backbone.model.encoder.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
505
+ "backbone.model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
506
+ "backbone.model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
507
+ "backbone.model.encoder.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
508
+ "backbone.model.encoder.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
509
+ "backbone.model.encoder.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
510
+ "backbone.model.encoder.layers.13.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
511
+ "backbone.model.encoder.layers.13.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
512
+ "backbone.model.encoder.layers.13.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
513
+ "backbone.model.encoder.layers.13.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
514
+ "backbone.model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
515
+ "backbone.model.encoder.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
516
+ "backbone.model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
517
+ "backbone.model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
518
+ "backbone.model.encoder.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
519
+ "backbone.model.encoder.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
520
+ "backbone.model.encoder.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
521
+ "backbone.model.encoder.layers.14.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
522
+ "backbone.model.encoder.layers.14.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
523
+ "backbone.model.encoder.layers.14.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
524
+ "backbone.model.encoder.layers.14.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
525
+ "backbone.model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
526
+ "backbone.model.encoder.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
527
+ "backbone.model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
528
+ "backbone.model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
529
+ "backbone.model.encoder.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
530
+ "backbone.model.encoder.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
531
+ "backbone.model.encoder.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
532
+ "backbone.model.encoder.layers.15.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
533
+ "backbone.model.encoder.layers.15.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
534
+ "backbone.model.encoder.layers.15.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
535
+ "backbone.model.encoder.layers.15.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
536
+ "backbone.model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
537
+ "backbone.model.encoder.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
538
+ "backbone.model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
539
+ "backbone.model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
540
+ "backbone.model.encoder.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
541
+ "backbone.model.encoder.layers.16.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
542
+ "backbone.model.encoder.layers.16.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
543
+ "backbone.model.encoder.layers.16.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
544
+ "backbone.model.encoder.layers.16.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
545
+ "backbone.model.encoder.layers.16.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
546
+ "backbone.model.encoder.layers.16.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
547
+ "backbone.model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
548
+ "backbone.model.encoder.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
549
+ "backbone.model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
550
+ "backbone.model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
551
+ "backbone.model.encoder.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
552
+ "backbone.model.encoder.layers.17.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
553
+ "backbone.model.encoder.layers.17.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
554
+ "backbone.model.encoder.layers.17.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
555
+ "backbone.model.encoder.layers.17.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
556
+ "backbone.model.encoder.layers.17.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
557
+ "backbone.model.encoder.layers.17.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
558
+ "backbone.model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
559
+ "backbone.model.encoder.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
560
+ "backbone.model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
561
+ "backbone.model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
562
+ "backbone.model.encoder.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
563
+ "backbone.model.encoder.layers.18.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
564
+ "backbone.model.encoder.layers.18.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
565
+ "backbone.model.encoder.layers.18.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
566
+ "backbone.model.encoder.layers.18.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
567
+ "backbone.model.encoder.layers.18.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
568
+ "backbone.model.encoder.layers.18.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
569
+ "backbone.model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
570
+ "backbone.model.encoder.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
571
+ "backbone.model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
572
+ "backbone.model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
573
+ "backbone.model.encoder.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
574
+ "backbone.model.encoder.layers.19.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
575
+ "backbone.model.encoder.layers.19.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
576
+ "backbone.model.encoder.layers.19.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
577
+ "backbone.model.encoder.layers.19.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
578
+ "backbone.model.encoder.layers.19.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
579
+ "backbone.model.encoder.layers.19.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
580
+ "backbone.model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
581
+ "backbone.model.encoder.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
582
+ "backbone.model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
583
+ "backbone.model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
584
+ "backbone.model.encoder.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
585
+ "backbone.model.encoder.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
586
+ "backbone.model.encoder.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
587
+ "backbone.model.encoder.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
588
+ "backbone.model.encoder.layers.2.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
589
+ "backbone.model.encoder.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
590
+ "backbone.model.encoder.layers.2.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
591
+ "backbone.model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
592
+ "backbone.model.encoder.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
593
+ "backbone.model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
594
+ "backbone.model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
595
+ "backbone.model.encoder.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
596
+ "backbone.model.encoder.layers.20.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
597
+ "backbone.model.encoder.layers.20.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
598
+ "backbone.model.encoder.layers.20.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
599
+ "backbone.model.encoder.layers.20.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
600
+ "backbone.model.encoder.layers.20.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
601
+ "backbone.model.encoder.layers.20.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
602
+ "backbone.model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
603
+ "backbone.model.encoder.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
604
+ "backbone.model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
605
+ "backbone.model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
606
+ "backbone.model.encoder.layers.21.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
607
+ "backbone.model.encoder.layers.21.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
608
+ "backbone.model.encoder.layers.21.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
609
+ "backbone.model.encoder.layers.21.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
610
+ "backbone.model.encoder.layers.21.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
611
+ "backbone.model.encoder.layers.21.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
612
+ "backbone.model.encoder.layers.21.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
613
+ "backbone.model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
614
+ "backbone.model.encoder.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
615
+ "backbone.model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
616
+ "backbone.model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
617
+ "backbone.model.encoder.layers.22.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
618
+ "backbone.model.encoder.layers.22.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
619
+ "backbone.model.encoder.layers.22.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
620
+ "backbone.model.encoder.layers.22.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
621
+ "backbone.model.encoder.layers.22.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
622
+ "backbone.model.encoder.layers.22.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
623
+ "backbone.model.encoder.layers.22.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
624
+ "backbone.model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
625
+ "backbone.model.encoder.layers.22.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
626
+ "backbone.model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
627
+ "backbone.model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
628
+ "backbone.model.encoder.layers.23.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
629
+ "backbone.model.encoder.layers.23.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
630
+ "backbone.model.encoder.layers.23.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
631
+ "backbone.model.encoder.layers.23.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
632
+ "backbone.model.encoder.layers.23.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
633
+ "backbone.model.encoder.layers.23.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
634
+ "backbone.model.encoder.layers.23.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
635
+ "backbone.model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
636
+ "backbone.model.encoder.layers.23.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
637
+ "backbone.model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
638
+ "backbone.model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
639
+ "backbone.model.encoder.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
640
+ "backbone.model.encoder.layers.24.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
641
+ "backbone.model.encoder.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
642
+ "backbone.model.encoder.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
643
+ "backbone.model.encoder.layers.24.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
644
+ "backbone.model.encoder.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
645
+ "backbone.model.encoder.layers.24.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
646
+ "backbone.model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
647
+ "backbone.model.encoder.layers.24.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
648
+ "backbone.model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
649
+ "backbone.model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
650
+ "backbone.model.encoder.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
651
+ "backbone.model.encoder.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
652
+ "backbone.model.encoder.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
653
+ "backbone.model.encoder.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
654
+ "backbone.model.encoder.layers.25.post_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
655
+ "backbone.model.encoder.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00003.safetensors",
656
+ "backbone.model.encoder.layers.25.pre_self_attn_layernorm.weight": "model-00002-of-00003.safetensors",
657
+ "backbone.model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
658
+ "backbone.model.encoder.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
659
+ "backbone.model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
660
+ "backbone.model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
661
+ "backbone.model.encoder.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
662
+ "backbone.model.encoder.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
663
+ "backbone.model.encoder.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
664
+ "backbone.model.encoder.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
665
+ "backbone.model.encoder.layers.3.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
666
+ "backbone.model.encoder.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
667
+ "backbone.model.encoder.layers.3.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
668
+ "backbone.model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
669
+ "backbone.model.encoder.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
670
+ "backbone.model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
671
+ "backbone.model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
672
+ "backbone.model.encoder.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
673
+ "backbone.model.encoder.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
674
+ "backbone.model.encoder.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
675
+ "backbone.model.encoder.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
676
+ "backbone.model.encoder.layers.4.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
677
+ "backbone.model.encoder.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
678
+ "backbone.model.encoder.layers.4.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
679
+ "backbone.model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
680
+ "backbone.model.encoder.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
681
+ "backbone.model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
682
+ "backbone.model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
683
+ "backbone.model.encoder.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
684
+ "backbone.model.encoder.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
685
+ "backbone.model.encoder.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
686
+ "backbone.model.encoder.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
687
+ "backbone.model.encoder.layers.5.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
688
+ "backbone.model.encoder.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
689
+ "backbone.model.encoder.layers.5.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
690
+ "backbone.model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
691
+ "backbone.model.encoder.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
692
+ "backbone.model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
693
+ "backbone.model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
694
+ "backbone.model.encoder.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
695
+ "backbone.model.encoder.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
696
+ "backbone.model.encoder.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
697
+ "backbone.model.encoder.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
698
+ "backbone.model.encoder.layers.6.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
699
+ "backbone.model.encoder.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
700
+ "backbone.model.encoder.layers.6.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
701
+ "backbone.model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
702
+ "backbone.model.encoder.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
703
+ "backbone.model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
704
+ "backbone.model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
705
+ "backbone.model.encoder.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
706
+ "backbone.model.encoder.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
707
+ "backbone.model.encoder.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
708
+ "backbone.model.encoder.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
709
+ "backbone.model.encoder.layers.7.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
710
+ "backbone.model.encoder.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
711
+ "backbone.model.encoder.layers.7.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
712
+ "backbone.model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
713
+ "backbone.model.encoder.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
714
+ "backbone.model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
715
+ "backbone.model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
716
+ "backbone.model.encoder.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
717
+ "backbone.model.encoder.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
718
+ "backbone.model.encoder.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
719
+ "backbone.model.encoder.layers.8.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
720
+ "backbone.model.encoder.layers.8.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
721
+ "backbone.model.encoder.layers.8.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
722
+ "backbone.model.encoder.layers.8.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
723
+ "backbone.model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
724
+ "backbone.model.encoder.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
725
+ "backbone.model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
726
+ "backbone.model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
727
+ "backbone.model.encoder.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
728
+ "backbone.model.encoder.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
729
+ "backbone.model.encoder.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
730
+ "backbone.model.encoder.layers.9.post_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
731
+ "backbone.model.encoder.layers.9.post_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
732
+ "backbone.model.encoder.layers.9.pre_feedforward_layernorm.weight": "model-00001-of-00003.safetensors",
733
+ "backbone.model.encoder.layers.9.pre_self_attn_layernorm.weight": "model-00001-of-00003.safetensors",
734
+ "backbone.model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
735
+ "backbone.model.encoder.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
736
+ "backbone.model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
737
+ "backbone.model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
738
+ "backbone.model.encoder.norm.weight": "model-00001-of-00003.safetensors",
739
+ "predict_layer.0.0.bias": "model-00003-of-00003.safetensors",
740
+ "predict_layer.0.0.weight": "model-00003-of-00003.safetensors",
741
+ "predict_layer.0.2.bias": "model-00003-of-00003.safetensors",
742
+ "predict_layer.0.2.weight": "model-00003-of-00003.safetensors"
743
+ }
744
+ }
aratako_tts/T5Gemma-TTS-2b-2b/modeling_t5gemma_voice.py ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face compatible wrapper of the T5Gemma-TTS model.
3
+
4
+ This is largely a drop-in copy of `models/t5gemma.py`, but inherits
5
+ `PreTrainedModel` so that it can be loaded via `AutoModelForSeq2SeqLM` with
6
+ `trust_remote_code=True`. Only the inference-oriented pieces are kept intact.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from typing import Callable, Dict, List, Optional, Tuple, Union
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from transformers import AutoModelForSeq2SeqLM, PreTrainedModel
18
+ from transformers.cache_utils import Cache
19
+ from transformers.generation import GenerationMixin
20
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
21
+ from transformers.models.t5gemma.modeling_t5gemma import (
22
+ ALL_ATTENTION_FUNCTIONS,
23
+ EncoderDecoderCache,
24
+ T5GemmaCrossAttention,
25
+ T5GemmaDecoderLayer,
26
+ T5GemmaRotaryEmbedding,
27
+ eager_attention_forward,
28
+ rotate_half,
29
+ )
30
+
31
+ try:
32
+ from .configuration_t5gemma_voice import T5GemmaVoiceConfig
33
+ except ImportError: # when executed inside the repo package
34
+ from hf_export.configuration_t5gemma_voice import T5GemmaVoiceConfig
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
40
+ """Return Bool mask [B, T] where True indicates padding."""
41
+ assert lengths.ndim == 1, lengths.ndim
42
+ max_len = max(max_len, lengths.max())
43
+ n = lengths.size(0)
44
+ seq_range = torch.arange(0, max_len, device=lengths.device)
45
+ expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
46
+ return expanded_lengths >= lengths.unsqueeze(-1)
47
+
48
+
49
+ def top_k_top_p_filtering(
50
+ logits,
51
+ top_k=0,
52
+ top_p=1.0,
53
+ min_p=0.0,
54
+ filter_value=-float("Inf"),
55
+ min_tokens_to_keep=1,
56
+ ):
57
+ min_p_enabled = 0.0 < min_p < 1.0
58
+ if min_p_enabled:
59
+ probs = F.softmax(logits, dim=-1)
60
+ indices_to_remove = probs < min_p
61
+ if torch.all(indices_to_remove.sum(-1) < logits.size(-1)):
62
+ logits = logits.masked_fill(indices_to_remove, filter_value)
63
+ top_k = 0
64
+ top_p = 1.0
65
+
66
+ if isinstance(top_k, int) and top_k > 0:
67
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
68
+ threshold = torch.topk(logits, top_k, dim=-1)[0][..., -1, None]
69
+ indices_to_remove = logits < threshold
70
+ logits[indices_to_remove] = filter_value
71
+ elif isinstance(top_k, list):
72
+ assert len(top_k) == logits.size(
73
+ 0
74
+ ), f"top_k list length ({len(top_k)}) must match logits.size(0) ({logits.size(0)})"
75
+ for i in range(logits.size(0)):
76
+ k_i = top_k[i]
77
+ if k_i > 0:
78
+ k_i = min(max(k_i, min_tokens_to_keep), logits.size(-1))
79
+ row_threshold = torch.topk(logits[i], k_i, dim=-1)[0][-1]
80
+ indices_to_remove_i = logits[i] < row_threshold
81
+ logits[i, indices_to_remove_i] = filter_value
82
+
83
+ if top_p < 1.0:
84
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
85
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
86
+ sorted_indices_to_remove = cumulative_probs > top_p
87
+ if min_tokens_to_keep > 1:
88
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
89
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
90
+ sorted_indices_to_remove[..., 0] = 0
91
+
92
+ indices_to_remove = torch.zeros_like(logits, dtype=torch.bool)
93
+ indices_to_remove.scatter_(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
94
+ logits = logits.masked_fill(indices_to_remove, filter_value)
95
+ return logits
96
+
97
+
98
+ def topk_sampling(logits, top_k=10, top_p=1.0, min_p=0.0, temperature=1.0):
99
+ if temperature != 1.0:
100
+ logits = logits / temperature
101
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p, min_p=min_p)
102
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
103
+ return token
104
+
105
+
106
+ class PMCrossAttention(T5GemmaCrossAttention):
107
+ """T5Gemma cross-attention augmented with Progress-Monitoring RoPE."""
108
+
109
+ def __init__(self, config, layer_idx: int):
110
+ super().__init__(config=config, layer_idx=layer_idx)
111
+ self.decoder_rotary_emb = T5GemmaRotaryEmbedding(config=config)
112
+ self.encoder_rotary_emb = T5GemmaRotaryEmbedding(config=config)
113
+
114
+ @staticmethod
115
+ def _apply_rotary_with_progress(
116
+ projected_states: torch.Tensor,
117
+ base_states: torch.Tensor,
118
+ position_ids: Optional[torch.Tensor],
119
+ rotary_module: T5GemmaRotaryEmbedding,
120
+ ) -> torch.Tensor:
121
+ if position_ids is None:
122
+ return projected_states
123
+ cos, sin = rotary_module(base_states, position_ids)
124
+ cos = cos.unsqueeze(1).to(
125
+ dtype=projected_states.dtype, device=projected_states.device
126
+ )
127
+ sin = sin.unsqueeze(1).to(
128
+ dtype=projected_states.dtype, device=projected_states.device
129
+ )
130
+ return (projected_states * cos) + (rotate_half(projected_states) * sin)
131
+
132
+ def forward(
133
+ self,
134
+ hidden_states: torch.Tensor,
135
+ attention_mask: Optional[torch.Tensor],
136
+ encoder_hidden_states: Optional[torch.Tensor],
137
+ past_key_values: Optional[Cache] = None,
138
+ pm_decoder_position_ids: Optional[torch.Tensor] = None,
139
+ pm_encoder_position_ids: Optional[torch.Tensor] = None,
140
+ **kwargs: FlashAttentionKwargs,
141
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
142
+ if encoder_hidden_states is None:
143
+ raise ValueError("Encoder hidden state is required for cross attention.")
144
+
145
+ pm_decoder_position_ids = kwargs.pop(
146
+ "pm_decoder_position_ids", pm_decoder_position_ids
147
+ )
148
+ pm_encoder_position_ids = kwargs.pop(
149
+ "pm_encoder_position_ids", pm_encoder_position_ids
150
+ )
151
+
152
+ input_shape = hidden_states.shape[:-1]
153
+ hidden_shape = (*input_shape, -1, self.head_dim)
154
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
155
+ if pm_decoder_position_ids is not None:
156
+ query_states = self._apply_rotary_with_progress(
157
+ query_states,
158
+ hidden_states,
159
+ pm_decoder_position_ids,
160
+ self.decoder_rotary_emb,
161
+ )
162
+
163
+ if past_key_values is not None:
164
+ is_updated = past_key_values.is_updated.get(self.layer_idx)
165
+ curr_past_key_values = past_key_values.cross_attention_cache
166
+
167
+ if past_key_values is None or not is_updated:
168
+ encoder_input_shape = encoder_hidden_states.shape[:-1]
169
+ encoder_hidden_shape = (*encoder_input_shape, -1, self.head_dim)
170
+ key_states = (
171
+ self.k_proj(encoder_hidden_states)
172
+ .view(encoder_hidden_shape)
173
+ .transpose(1, 2)
174
+ )
175
+ if pm_encoder_position_ids is not None:
176
+ key_states = self._apply_rotary_with_progress(
177
+ key_states,
178
+ encoder_hidden_states,
179
+ pm_encoder_position_ids,
180
+ self.encoder_rotary_emb,
181
+ )
182
+ value_states = (
183
+ self.v_proj(encoder_hidden_states)
184
+ .view(encoder_hidden_shape)
185
+ .transpose(1, 2)
186
+ )
187
+
188
+ if past_key_values is not None:
189
+ key_states, value_states = curr_past_key_values.update(
190
+ key_states, value_states, self.layer_idx
191
+ )
192
+ past_key_values.is_updated[self.layer_idx] = True
193
+ else:
194
+ key_states = curr_past_key_values.layers[self.layer_idx].keys
195
+ value_states = curr_past_key_values.layers[self.layer_idx].values
196
+
197
+ attention_interface: Callable = eager_attention_forward
198
+ if self.config._attn_implementation != "eager":
199
+ attention_interface = ALL_ATTENTION_FUNCTIONS[
200
+ self.config._attn_implementation
201
+ ]
202
+
203
+ attn_output, attn_weights = attention_interface(
204
+ self,
205
+ query_states,
206
+ key_states,
207
+ value_states,
208
+ attention_mask,
209
+ dropout=self.attention_dropout if self.training else 0.0,
210
+ scaling=self.scaling,
211
+ sliding_window=None,
212
+ softcap=self.attn_logit_softcapping,
213
+ **kwargs,
214
+ )
215
+
216
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
217
+ attn_output = self.o_proj(attn_output)
218
+ return attn_output, attn_weights
219
+
220
+
221
+ class PMDecoderLayer(T5GemmaDecoderLayer):
222
+ """Decoder layer variant with PM-RoPE cross-attention built in."""
223
+
224
+ def __init__(self, config, layer_idx: int):
225
+ super().__init__(config, layer_idx)
226
+ self.cross_attn = PMCrossAttention(config=config, layer_idx=layer_idx)
227
+
228
+ def forward(
229
+ self,
230
+ hidden_states: torch.Tensor,
231
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
232
+ attention_mask: Optional[torch.Tensor] = None,
233
+ position_ids: Optional[torch.LongTensor] = None,
234
+ past_key_values: Optional[EncoderDecoderCache] = None,
235
+ use_cache: Optional[bool] = False,
236
+ cache_position: Optional[torch.LongTensor] = None,
237
+ encoder_hidden_states: Optional[torch.Tensor] = None,
238
+ encoder_attention_mask: Optional[torch.Tensor] = None,
239
+ pm_decoder_position_ids: Optional[torch.Tensor] = None,
240
+ pm_encoder_position_ids: Optional[torch.Tensor] = None,
241
+ **kwargs,
242
+ ) -> torch.FloatTensor:
243
+ pm_decoder_position_ids = kwargs.pop(
244
+ "pm_decoder_position_ids", pm_decoder_position_ids
245
+ )
246
+ pm_encoder_position_ids = kwargs.pop(
247
+ "pm_encoder_position_ids", pm_encoder_position_ids
248
+ )
249
+
250
+ residual = hidden_states
251
+ hidden_states = self.pre_self_attn_layernorm(hidden_states)
252
+ hidden_states, _ = self.self_attn(
253
+ hidden_states=hidden_states,
254
+ position_embeddings=position_embeddings,
255
+ attention_mask=attention_mask,
256
+ position_ids=position_ids,
257
+ past_key_values=(
258
+ past_key_values.self_attention_cache
259
+ if past_key_values is not None
260
+ else None
261
+ ),
262
+ use_cache=use_cache,
263
+ cache_position=cache_position,
264
+ **kwargs,
265
+ )
266
+ hidden_states = self.post_self_attn_layernorm(hidden_states)
267
+ hidden_states = residual + self.dropout(hidden_states)
268
+
269
+ residual = hidden_states
270
+ hidden_states = self.pre_cross_attn_layernorm(hidden_states)
271
+ hidden_states, _ = self.cross_attn(
272
+ hidden_states=hidden_states,
273
+ encoder_hidden_states=encoder_hidden_states,
274
+ attention_mask=encoder_attention_mask,
275
+ past_key_values=past_key_values,
276
+ pm_decoder_position_ids=pm_decoder_position_ids,
277
+ pm_encoder_position_ids=pm_encoder_position_ids,
278
+ **kwargs,
279
+ )
280
+ hidden_states = self.post_cross_attn_layernorm(hidden_states)
281
+ hidden_states = residual + self.dropout(hidden_states)
282
+
283
+ residual = hidden_states
284
+ hidden_states = self.pre_feedforward_layernorm(hidden_states)
285
+ hidden_states = self.mlp(hidden_states)
286
+ hidden_states = self.post_feedforward_layernorm(hidden_states)
287
+ hidden_states = residual + self.dropout(hidden_states)
288
+ return hidden_states
289
+
290
+
291
+ def _make_args_from_config(config: T5GemmaVoiceConfig):
292
+ """Namespace-like shim; keeps attribute access identical to training code."""
293
+
294
+ class _Obj:
295
+ pass
296
+
297
+ o = _Obj()
298
+ for k, v in config.to_dict().items():
299
+ setattr(o, k, v)
300
+ return o
301
+
302
+
303
+ class T5GemmaVoiceForConditionalGeneration(PreTrainedModel, GenerationMixin):
304
+ config_class = T5GemmaVoiceConfig
305
+ base_model_prefix = "backbone"
306
+ _keys_to_ignore_on_save = ["encoder_module", "decoder_module"]
307
+
308
+ def __init__(self, config: T5GemmaVoiceConfig):
309
+ super().__init__(config)
310
+ # keep compatibility with original code that expects self.args
311
+ self.args = _make_args_from_config(config)
312
+ if getattr(self.args, "n_codebooks", 1) != 1:
313
+ logging.info("Resetting n_codebooks to 1 for XCodec2 backend.")
314
+ self.args.n_codebooks = 1
315
+
316
+ logging.info(f"Loading T5Gemma backbone: {self.args.t5gemma_model_name}")
317
+ precision = getattr(self.args, "precision", "float32")
318
+ if precision == "float16":
319
+ dtype = torch.float16
320
+ elif precision == "bfloat16":
321
+ dtype = torch.bfloat16
322
+ else:
323
+ dtype = torch.float32
324
+
325
+ # instantiate backbone from config to avoid weight download during load
326
+ if config.t5_config_dict is not None:
327
+ from transformers.models.t5gemma import T5GemmaConfig
328
+
329
+ base_cfg = T5GemmaConfig(**config.t5_config_dict)
330
+ base_cfg._attn_implementation = getattr(
331
+ self.args, "attn_implementation", "eager"
332
+ )
333
+ # Force bf16/specified dtype initialization and disable all tying.
334
+ base_cfg.tie_word_embeddings = False
335
+ base_cfg.tie_input_output_embeddings = False
336
+ base_cfg.tie_encoder_decoder = False
337
+ if hasattr(base_cfg, "encoder"):
338
+ base_cfg.encoder.tie_word_embeddings = False
339
+ base_cfg.encoder.tie_input_output_embeddings = False
340
+ base_cfg.encoder.tie_encoder_decoder = False
341
+ if hasattr(base_cfg, "decoder"):
342
+ base_cfg.decoder.tie_word_embeddings = False
343
+ base_cfg.decoder.tie_input_output_embeddings = False
344
+ base_cfg.decoder.tie_encoder_decoder = False
345
+ self.backbone = AutoModelForSeq2SeqLM.from_config(
346
+ base_cfg, torch_dtype=dtype
347
+ )
348
+ else:
349
+ self.backbone = AutoModelForSeq2SeqLM.from_pretrained(
350
+ self.args.t5gemma_model_name,
351
+ attn_implementation=getattr(self.args, "attn_implementation", "eager"),
352
+ torch_dtype=dtype,
353
+ )
354
+
355
+ prune_text_modules = getattr(self.args, "prune_text_modules", 0)
356
+ drop_lm_head = prune_text_modules >= 1
357
+ drop_decoder_embed = prune_text_modules >= 2
358
+
359
+ if drop_lm_head and hasattr(self.backbone, "lm_head"):
360
+ del self.backbone.lm_head
361
+ self.backbone.lm_head = nn.Identity()
362
+ if hasattr(self.backbone.config, "tie_word_embeddings"):
363
+ self.backbone.config.tie_word_embeddings = False
364
+ logging.info("lm_head removed (prune_text_modules=%d)", prune_text_modules)
365
+
366
+ if drop_decoder_embed:
367
+ decoder = getattr(
368
+ self.backbone, "model", getattr(self.backbone, "decoder", None)
369
+ )
370
+ decoder = getattr(decoder, "decoder", decoder)
371
+ if decoder is not None and hasattr(decoder, "embed_tokens"):
372
+ del decoder.embed_tokens
373
+ decoder.embed_tokens = nn.Identity()
374
+ if hasattr(self.backbone.config, "tie_word_embeddings"):
375
+ self.backbone.config.tie_word_embeddings = False
376
+ logging.info(
377
+ "decoder.embed_tokens removed (prune_text_modules=%d)",
378
+ prune_text_modules,
379
+ )
380
+
381
+ # This wrapper is inference-only, so keep cache enabled.
382
+ self.backbone.config.use_cache = True
383
+
384
+ if hasattr(self.backbone, "model"):
385
+ self.encoder_module = self.backbone.model.encoder
386
+ self.decoder_module = self.backbone.model.decoder
387
+ else:
388
+ self.encoder_module = getattr(self.backbone, "encoder", None)
389
+ self.decoder_module = getattr(self.backbone, "decoder", None)
390
+ if self.encoder_module is None or self.decoder_module is None:
391
+ raise AttributeError(
392
+ "Failed to locate encoder/decoder modules on T5Gemma backbone."
393
+ )
394
+
395
+ config_hidden_size = getattr(self.backbone.config, "d_model", None)
396
+ if config_hidden_size is None:
397
+ config_hidden_size = getattr(self.backbone.config, "hidden_size", None)
398
+ if config_hidden_size is None:
399
+ enc = getattr(self.backbone.config, "encoder", None)
400
+ if enc is not None:
401
+ config_hidden_size = getattr(enc, "hidden_size", None)
402
+ if config_hidden_size is None:
403
+ raise AttributeError("T5Gemma config does not expose d_model/hidden_size.")
404
+
405
+ self.hidden_size = config_hidden_size
406
+ self.args.audio_embedding_dim = getattr(
407
+ self.args, "audio_embedding_dim", self.hidden_size
408
+ )
409
+
410
+ self._enable_pm_rope_cross_attention()
411
+
412
+ self.text_input_type = "text" # fixed
413
+ self.text_embedding = None
414
+ self.text_dropout = nn.Identity()
415
+
416
+ if isinstance(self.args.audio_vocab_size, list):
417
+ audio_vocab_sizes = [
418
+ size + self.args.n_special for size in self.args.audio_vocab_size
419
+ ]
420
+ else:
421
+ audio_vocab_sizes = [
422
+ self.args.audio_vocab_size + self.args.n_special
423
+ ] * self.args.n_codebooks
424
+ self.n_audio_tokens = audio_vocab_sizes
425
+
426
+ self.audio_embedding = nn.ModuleList(
427
+ [
428
+ nn.Embedding(audio_vocab_sizes[k], self.hidden_size)
429
+ for k in range(self.args.n_codebooks)
430
+ ]
431
+ )
432
+ self.audio_dropout = nn.Dropout(0.0)
433
+
434
+ self.predict_layer = nn.ModuleList(
435
+ [
436
+ nn.Sequential(
437
+ nn.Linear(self.hidden_size, self.hidden_size),
438
+ nn.GELU(),
439
+ nn.Linear(self.hidden_size, audio_vocab_sizes[k]),
440
+ )
441
+ for k in range(self.args.n_codebooks)
442
+ ]
443
+ )
444
+ self.progress_scale = getattr(self.args, "progress_scale", 2000.0)
445
+
446
+ def get_output_embeddings(self):
447
+ return None
448
+
449
+ def set_output_embeddings(self, new_embeddings):
450
+ raise NotImplementedError("Output embeddings are pruned in this model.")
451
+
452
+ # avoid transformers default tying logic (lm_head is removed)
453
+ def tie_weights(self):
454
+ return
455
+
456
+ def get_encoder(self):
457
+ return self.encoder_module
458
+
459
+ def get_decoder(self):
460
+ return self.decoder_module
461
+
462
+ def state_dict(self, *args, **kwargs): # pragma: no cover - save hook
463
+ sd = super().state_dict(*args, **kwargs)
464
+ drop_keys = [
465
+ k
466
+ for k in sd
467
+ if k.startswith("encoder_module.") or k.startswith("decoder_module.")
468
+ ]
469
+ for k in drop_keys:
470
+ sd.pop(k)
471
+ return sd
472
+
473
+ def _progress_positions_single(self, length: int, device) -> torch.Tensor:
474
+ if length <= 0:
475
+ return torch.zeros(0, device=device, dtype=torch.float32)
476
+ if length == 1:
477
+ return torch.zeros(1, device=device, dtype=torch.float32)
478
+ base = torch.arange(length, device=device, dtype=torch.float32)
479
+ return base / (length - 1) * self.progress_scale
480
+
481
+ def _build_position_ids(
482
+ self, lengths: torch.Tensor, max_len: int, device
483
+ ) -> torch.Tensor:
484
+ # Vectorized implementation: avoid Python loop over batch dimension.
485
+ # Ensure lengths is on the correct device to prevent device mismatch.
486
+ lengths = lengths.to(device=device)
487
+ pos = torch.arange(max_len, device=device, dtype=torch.float32)[None, :] # [1, T]
488
+
489
+ # Clamp denominator to avoid division by zero for length <= 1.
490
+ # For length 0 or 1, result will be masked to zero anyway.
491
+ denom = (lengths.clamp(min=2).to(torch.float32) - 1.0)[:, None] # [B, 1]
492
+ position_ids = pos / denom * self.progress_scale # [B, T]
493
+
494
+ # Mask out positions beyond each sequence's length.
495
+ mask = pos < lengths[:, None] # [B, T] (bool)
496
+ return position_ids.masked_fill(~mask, 0.0)
497
+
498
+ def _enable_pm_rope_cross_attention(self) -> None:
499
+ if getattr(self, "_pm_rope_enabled", False):
500
+ return
501
+ if not getattr(self.args, "use_pm_rope", 1):
502
+ logging.info("PM-RoPE cross-attention disabled by config.")
503
+ return
504
+ decoder_layers = getattr(self.decoder_module, "layers", None)
505
+ if decoder_layers is None:
506
+ logging.warning(
507
+ "Decoder module does not expose layers attribute; skipping PM-RoPE injection."
508
+ )
509
+ return
510
+
511
+ new_layers = nn.ModuleList()
512
+ for layer in decoder_layers:
513
+ pm_layer = PMDecoderLayer(layer.config, layer.layer_idx)
514
+ pm_layer.load_state_dict(layer.state_dict(), strict=False)
515
+ pm_layer.gradient_checkpointing = getattr(
516
+ layer, "gradient_checkpointing", False
517
+ )
518
+ if hasattr(layer, "_gradient_checkpointing_func"):
519
+ pm_layer._gradient_checkpointing_func = (
520
+ layer._gradient_checkpointing_func
521
+ )
522
+ new_layers.append(pm_layer)
523
+ self.decoder_module.layers = new_layers
524
+ self._pm_rope_enabled = True
525
+ logging.info(
526
+ "PM-RoPE cross-attention enabled for %d decoder layers.", len(new_layers)
527
+ )
528
+
529
+ # Generation-style inference with batch support for multiple samples
530
+ @torch.inference_mode()
531
+ def inference_tts(
532
+ self,
533
+ x: torch.Tensor,
534
+ x_lens: torch.Tensor,
535
+ y: torch.Tensor,
536
+ tgt_y_lens: torch.Tensor,
537
+ top_k: Union[int, List[int]] = -100,
538
+ top_p: float = 1.0,
539
+ min_p: float = 0.0,
540
+ temperature: float = 1.0,
541
+ stop_repetition: int = 3,
542
+ silence_tokens: List[int] = None,
543
+ multi_trial: List[int] = None,
544
+ num_samples: int = 1,
545
+ **kwargs,
546
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
547
+ """
548
+ Run TTS inference.
549
+
550
+ Args:
551
+ num_samples: Number of samples to generate in parallel with different
552
+ random seeds. Input x and y are duplicated internally.
553
+
554
+ Returns:
555
+ Tuple of (concat_frames, gen_frames) with shape [num_samples, 1, T].
556
+ """
557
+ if getattr(self.args, "n_codebooks", 1) != 1:
558
+ raise ValueError("XCodec2 inference expects n_codebooks=1.")
559
+
560
+ self.backbone.config.use_cache = True
561
+ if multi_trial:
562
+ logging.warning("multi_trial is unsupported and will be ignored.")
563
+ silence_tokens = silence_tokens or []
564
+
565
+ device = x.device
566
+ eog_inference = (
567
+ self.args.eos if getattr(self.args, "eos", -1) > 0 else self.args.eog
568
+ )
569
+
570
+ # Input validation: expect batch_size=1, then expand to num_samples
571
+ assert x.shape[0] == 1, "Input batch size must be 1; use num_samples for parallel generation."
572
+ batch_size = num_samples
573
+
574
+ # Encoder runs once (same input for all samples)
575
+ x_padding_mask = make_pad_mask(x_lens).to(device)
576
+ encoder_attention_mask = (~x_padding_mask).long()
577
+ if getattr(self.args, "use_pm_rope", 1):
578
+ encoder_position_ids = self._build_position_ids(x_lens, x.shape[1], device)
579
+ else:
580
+ encoder_position_ids = None
581
+ if self.text_input_type == "text":
582
+ encoder_outputs = self.encoder_module(
583
+ input_ids=x,
584
+ attention_mask=encoder_attention_mask,
585
+ position_ids=encoder_position_ids,
586
+ )
587
+ else:
588
+ x_embeds = self.text_dropout(self.text_embedding(x))
589
+ encoder_outputs = self.encoder_module(
590
+ inputs_embeds=x_embeds,
591
+ attention_mask=encoder_attention_mask,
592
+ position_ids=encoder_position_ids,
593
+ )
594
+ memory = encoder_outputs.last_hidden_state # [1, T_enc, D]
595
+
596
+ # Expand encoder outputs for batch
597
+ if batch_size > 1:
598
+ memory = memory.expand(batch_size, -1, -1).contiguous()
599
+ encoder_attention_mask = encoder_attention_mask.expand(batch_size, -1).contiguous()
600
+ if encoder_position_ids is not None:
601
+ encoder_position_ids = encoder_position_ids.expand(batch_size, -1).contiguous()
602
+
603
+ if self.args.special_first:
604
+ y = y + int(self.args.n_special)
605
+ y = y.transpose(2, 1).contiguous() # [1, 1, T]
606
+ y_len = y.shape[-1]
607
+ prompt_frames = kwargs.get("prompt_frames", y_len)
608
+
609
+ # Expand y for batch
610
+ if batch_size > 1:
611
+ y = y.expand(batch_size, -1, -1).contiguous()
612
+
613
+ target_total = None
614
+ cutoff_limit = None
615
+ if tgt_y_lens is not None:
616
+ target_total = int(tgt_y_lens[0].item())
617
+ extra_cutoff = getattr(self.args, "extra_cutoff", 5.0)
618
+ codec_sr = int(getattr(self.args, "encodec_sr", 50))
619
+ cutoff_limit = target_total + int(codec_sr * extra_cutoff)
620
+
621
+ bos = torch.full(
622
+ (batch_size, 1, 1),
623
+ self.args.empty_token,
624
+ dtype=torch.long,
625
+ device=device,
626
+ )
627
+ cated_y = torch.cat([bos, y], dim=2)
628
+
629
+ new_y_len_value = cated_y.shape[-1]
630
+ new_y_lens = torch.full(
631
+ (batch_size,), new_y_len_value, dtype=torch.long, device=device
632
+ )
633
+ embedded_y = self.audio_embedding[0](cated_y[:, 0])
634
+ embedded_y = self.audio_dropout(embedded_y)
635
+
636
+ y_padding_mask = torch.full(
637
+ (batch_size, embedded_y.shape[1]), False, device=device
638
+ )
639
+ current_length = embedded_y.shape[1]
640
+ prompt_offset = prompt_frames + 1 # +BOS
641
+
642
+ decoder_attention_mask = (~y_padding_mask).long()
643
+
644
+ if target_total is not None:
645
+ est_total = int(target_total) + 1
646
+ elif cutoff_limit is not None:
647
+ est_total = int(cutoff_limit)
648
+ else:
649
+ lookahead = getattr(self.args, "progress_lookahead_secs", 2.0)
650
+ est_total = int(current_length + int(self.args.encodec_sr) * lookahead)
651
+ est_total = max(est_total, current_length)
652
+
653
+ # Pre-allocate attention mask buffer to avoid per-step tensor creation.
654
+ max_gen_length = est_total + int(getattr(self.args, "encodec_sr", 50) * 10)
655
+ full_dec_attention_mask = torch.ones(
656
+ (batch_size, max_gen_length), dtype=torch.long, device=device
657
+ )
658
+
659
+ cur_len = embedded_y.shape[1]
660
+ pm_kwargs = {}
661
+ decoder_position_ids_full = None
662
+ if getattr(self.args, "use_pm_rope", 1):
663
+ base = torch.arange(cur_len, device=device, dtype=torch.float32).unsqueeze(0)
664
+ decoder_position_ids_full = (
665
+ base / max(1, est_total - 1) * self.progress_scale
666
+ )
667
+ if batch_size > 1:
668
+ decoder_position_ids_full = decoder_position_ids_full.expand(batch_size, -1).contiguous()
669
+ pm_kwargs["position_ids"] = decoder_position_ids_full
670
+ pm_kwargs["pm_decoder_position_ids"] = decoder_position_ids_full
671
+ pm_kwargs["pm_encoder_position_ids"] = encoder_position_ids
672
+ else:
673
+ pm_kwargs["position_ids"] = None
674
+
675
+ decoder_outputs = self.decoder_module(
676
+ inputs_embeds=embedded_y,
677
+ attention_mask=decoder_attention_mask,
678
+ encoder_hidden_states=memory,
679
+ encoder_attention_mask=encoder_attention_mask,
680
+ use_cache=True,
681
+ **pm_kwargs,
682
+ )
683
+ last_hidden = decoder_outputs.last_hidden_state[:, -1:, :] # [B, 1, D]
684
+ past_key_values = decoder_outputs.past_key_values
685
+
686
+ # Batch generation state
687
+ generated_tokens: List[torch.Tensor] = [] # List of [B] tensors
688
+ cur_num_gen = 0
689
+ prev_tokens = torch.full((batch_size,), -1, dtype=torch.long, device=device)
690
+ consec_silence_counts = torch.zeros(batch_size, dtype=torch.long, device=device)
691
+ finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
692
+ silence_set = set(silence_tokens)
693
+
694
+ # Compute budgets once
695
+ first_input_len = int(x_lens[0].item())
696
+ text_mode = getattr(self.args, "text_input_type", "text") == "text"
697
+ frames_per_token_cap = getattr(self.args, "text_guard_frames_per_token", 0)
698
+ extra_cutoff_val = getattr(self.args, "extra_cutoff", 5)
699
+
700
+ while not finished.all():
701
+ logits = self.predict_layer[0](last_hidden).squeeze(1) # [B, V]
702
+
703
+ effective_length = max(0, current_length - prompt_offset)
704
+
705
+ # Adjust logits for all samples
706
+ if effective_length == 0:
707
+ logits[:, eog_inference] = -1e9
708
+
709
+ if isinstance(top_k, list):
710
+ kk = top_k[min(len(top_k) - 1, cur_num_gen)]
711
+ else:
712
+ kk = top_k
713
+
714
+ if cur_num_gen <= self.args.encodec_sr // 5:
715
+ logits[:, eog_inference] = -10000.0
716
+
717
+ # Stop repetition penalty (vectorized)
718
+ if stop_repetition > 0 and silence_tokens:
719
+ for sil_tok in silence_tokens:
720
+ mask = (prev_tokens == sil_tok) & (consec_silence_counts > stop_repetition)
721
+ if mask.any():
722
+ penalty = (consec_silence_counts[mask] - (stop_repetition - 1)).float()
723
+ neg_mask = logits[mask, sil_tok] < 0
724
+ logits[mask, sil_tok] = torch.where(
725
+ neg_mask,
726
+ logits[mask, sil_tok] * penalty,
727
+ logits[mask, sil_tok] / penalty,
728
+ )
729
+
730
+ # Sample tokens for all batch elements
731
+ tokens = topk_sampling(
732
+ logits,
733
+ top_k=kk,
734
+ top_p=top_p,
735
+ min_p=min_p,
736
+ temperature=temperature,
737
+ ).squeeze(-1) # [B]
738
+
739
+ # Force stop conditions
740
+ should_force_stop = (tokens == eog_inference) | (logits.argmax(dim=-1) == eog_inference)
741
+
742
+ if not text_mode:
743
+ token_budget = first_input_len * max(1, int(self.args.encodec_sr) // 4)
744
+ should_force_stop |= (effective_length > token_budget)
745
+ elif frames_per_token_cap > 0:
746
+ token_budget = max(1, first_input_len) * frames_per_token_cap
747
+ should_force_stop |= (effective_length > token_budget)
748
+
749
+ if target_total is not None:
750
+ time_budget = target_total - prompt_offset + int(self.args.encodec_sr) * extra_cutoff_val
751
+ if cur_num_gen > time_budget:
752
+ should_force_stop[:] = True
753
+
754
+ # Apply force stop
755
+ tokens = torch.where(should_force_stop, torch.full_like(tokens, eog_inference), tokens)
756
+
757
+ # Update silence tracking
758
+ for sil_tok in silence_tokens:
759
+ is_same_silence = (tokens == sil_tok) & (prev_tokens == sil_tok)
760
+ consec_silence_counts = torch.where(
761
+ is_same_silence,
762
+ consec_silence_counts + 1,
763
+ torch.where(tokens == sil_tok, torch.ones_like(consec_silence_counts), torch.zeros_like(consec_silence_counts))
764
+ )
765
+
766
+ prev_tokens = tokens.clone()
767
+
768
+ # Mark finished samples
769
+ newly_finished = tokens == eog_inference
770
+ finished |= newly_finished
771
+
772
+ # Store tokens (use EOG for already-finished samples)
773
+ store_tokens = torch.where(finished & ~newly_finished, torch.full_like(tokens, eog_inference), tokens)
774
+ generated_tokens.append(store_tokens)
775
+
776
+ cur_num_gen += 1
777
+ current_length += 1
778
+
779
+ if finished.all():
780
+ break
781
+
782
+ # Embed next tokens
783
+ samples_emb = self.audio_embedding[0](tokens.unsqueeze(1)) # [B, 1, D]
784
+ samples_emb = self.audio_dropout(samples_emb)
785
+
786
+ if getattr(self.args, "use_pm_rope", 1):
787
+ new_pos_value = (
788
+ float(current_length - 1) / max(1, est_total - 1) * self.progress_scale
789
+ )
790
+ new_pos_value = min(new_pos_value, self.progress_scale)
791
+ pos_1 = torch.full(
792
+ (batch_size, 1), new_pos_value, device=device, dtype=torch.float32
793
+ )
794
+ pm_kwargs = {
795
+ "position_ids": pos_1,
796
+ "pm_decoder_position_ids": pos_1,
797
+ "pm_encoder_position_ids": encoder_position_ids,
798
+ }
799
+ else:
800
+ pm_kwargs = {"position_ids": None}
801
+
802
+ decoder_outputs = self.decoder_module(
803
+ inputs_embeds=samples_emb,
804
+ attention_mask=full_dec_attention_mask[:, :current_length],
805
+ encoder_hidden_states=memory,
806
+ encoder_attention_mask=encoder_attention_mask,
807
+ past_key_values=past_key_values,
808
+ use_cache=True,
809
+ **pm_kwargs,
810
+ )
811
+ past_key_values = decoder_outputs.past_key_values
812
+ last_hidden = decoder_outputs.last_hidden_state
813
+
814
+ # Stack generated tokens: [B, T_gen]
815
+ if generated_tokens:
816
+ generated_tensor = torch.stack(generated_tokens, dim=1)
817
+ else:
818
+ generated_tensor = torch.zeros((batch_size, 0), device=device, dtype=torch.long)
819
+
820
+ # Trim each sample to its actual length (up to first EOG)
821
+ # For simplicity, keep rectangular tensor but mask with EOG
822
+ # The caller can trim per-sample if needed
823
+
824
+ # Build result tensors
825
+ # y is [B, 1, T_prompt], generated_tensor is [B, T_gen]
826
+ expected_y_len = y_len + generated_tensor.shape[1]
827
+ res = torch.cat([y[:, 0, :], generated_tensor], dim=1).unsqueeze(1) # [B, 1, T_total]
828
+
829
+ if self.args.special_first:
830
+ res = res - int(self.args.n_special)
831
+ generated_tensor = generated_tensor - int(self.args.n_special)
832
+
833
+ return res, generated_tensor.unsqueeze(1) # [B, 1, T_gen]
aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7ad71f5d7ddad9b720aeed8239c802ca4cb6ff86ddf6f419167d1b7e6b38bf
3
+ size 483244
aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_fast.wav ADDED
Binary file (96 kB). View file
 
aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_normal.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebcf8e90d05a9938baa744545c833a586e62754b1250515c4f3454327fd1814
3
+ size 160044
aratako_tts/T5Gemma-TTS-2b-2b/samples/en_sample2_slow.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6ce0dfbecb9bdb0134fcf0a4eb153e4fd4254ad7fd440f568f0cb7e071d140b
3
+ size 224044
aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c508e86225ceb26596686513b6c76dc418678127a4f3f844426b5c26ebfb2277
3
+ size 793844
aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ba6861440144a36f2bb397bc2eb4d324f8601cd1a470aca4651f8cb080f26eb
3
+ size 908504
aratako_tts/T5Gemma-TTS-2b-2b/samples/gen_sample3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a4b67fe6df5f26196a05b09eaea8558a76612f44032245267f3af03466980be
3
+ size 274604
aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f708ca7bbae12c23e74aba7f75f6f0a227cbeed043d9d38853341df15935f9b5
3
+ size 1497680
aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_fast.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1d2fdd668b1fa84214421632b38cdb469d1047a8f3086dfc00651b8cc6acc77
3
+ size 264644
aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_normal.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e07c1af9be686390d79857af99b12fed4eebfbd0c8fc127efc1bc005449f2533
3
+ size 441044
aratako_tts/T5Gemma-TTS-2b-2b/samples/jp_sample2_slow.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef76b01620d62119ed3a9cd0a0b39d13dccbc7bfd27b418b70e274132e32ea6e
3
+ size 617444
aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:636767b965a9e9d194f939bc99820adbea9896c02e946cb45a735e77eb4fda8e
3
+ size 338732
aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b82f0176c1acddad13a34a91ff5941c2c63bf596e92d364df0ac325634deff
3
+ size 292868
aratako_tts/T5Gemma-TTS-2b-2b/samples/ref_sample3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44f987ddc1edc226f9f33dd173440fb8d8729933c74cad200ff20808d5c966e
3
+ size 269370
aratako_tts/T5Gemma-TTS-2b-2b/samples/zh_sample1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5ebba5135dd60f5067b4936caf3ed8d2d2791aaa207b9e1f9a8b572c5621d0
3
+ size 627244
aratako_tts/t5gemma-tokenizer/config.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "T5GemmaForConditionalGeneration"
4
+ ],
5
+ "classifier_dropout_rate": 0.0,
6
+ "decoder": {
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "attn_logit_softcapping": 50.0,
10
+ "classifier_dropout_rate": 0.0,
11
+ "cross_attention_hidden_size": 2304,
12
+ "dropout_rate": 0.0,
13
+ "final_logit_softcapping": 30.0,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2304,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 9216,
19
+ "is_decoder": true,
20
+ "layer_types": [
21
+ "sliding_attention",
22
+ "full_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "full_attention",
27
+ "sliding_attention",
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "full_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "full_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "full_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "full_attention"
47
+ ],
48
+ "max_position_embeddings": 8192,
49
+ "model_type": "t5_gemma_module",
50
+ "num_attention_heads": 8,
51
+ "num_hidden_layers": 26,
52
+ "num_key_value_heads": 4,
53
+ "query_pre_attn_scalar": 256,
54
+ "rms_norm_eps": 1e-06,
55
+ "rope_theta": 10000.0,
56
+ "sliding_window": 4096,
57
+ "torch_dtype": "bfloat16",
58
+ "use_cache": true,
59
+ "vocab_size": 256000
60
+ },
61
+ "dropout_rate": 0.0,
62
+ "encoder": {
63
+ "attention_bias": false,
64
+ "attention_dropout": 0.0,
65
+ "attn_logit_softcapping": 50.0,
66
+ "classifier_dropout_rate": 0.0,
67
+ "dropout_rate": 0.0,
68
+ "final_logit_softcapping": 30.0,
69
+ "head_dim": 256,
70
+ "hidden_activation": "gelu_pytorch_tanh",
71
+ "hidden_size": 2304,
72
+ "initializer_range": 0.02,
73
+ "intermediate_size": 9216,
74
+ "layer_types": [
75
+ "sliding_attention",
76
+ "full_attention",
77
+ "sliding_attention",
78
+ "full_attention",
79
+ "sliding_attention",
80
+ "full_attention",
81
+ "sliding_attention",
82
+ "full_attention",
83
+ "sliding_attention",
84
+ "full_attention",
85
+ "sliding_attention",
86
+ "full_attention",
87
+ "sliding_attention",
88
+ "full_attention",
89
+ "sliding_attention",
90
+ "full_attention",
91
+ "sliding_attention",
92
+ "full_attention",
93
+ "sliding_attention",
94
+ "full_attention",
95
+ "sliding_attention",
96
+ "full_attention",
97
+ "sliding_attention",
98
+ "full_attention",
99
+ "sliding_attention",
100
+ "full_attention"
101
+ ],
102
+ "max_position_embeddings": 8192,
103
+ "model_type": "t5_gemma_module",
104
+ "num_attention_heads": 8,
105
+ "num_hidden_layers": 26,
106
+ "num_key_value_heads": 4,
107
+ "query_pre_attn_scalar": 256,
108
+ "rms_norm_eps": 1e-06,
109
+ "rope_theta": 10000.0,
110
+ "sliding_window": 4096,
111
+ "torch_dtype": "bfloat16",
112
+ "use_cache": true,
113
+ "vocab_size": 256000
114
+ },
115
+ "eos_token_id": [
116
+ 1,
117
+ 107
118
+ ],
119
+ "initializer_range": 0.02,
120
+ "is_encoder_decoder": true,
121
+ "model_type": "t5gemma",
122
+ "pad_token_id": 0,
123
+ "torch_dtype": "bfloat16",
124
+ "transformers_version": "4.53.0.dev0",
125
+ "use_cache": true
126
+ }
aratako_tts/t5gemma-tokenizer/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": [
5
+ 1,
6
+ 107
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "4.53.0.dev0"
10
+ }
aratako_tts/t5gemma-tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<start_of_turn>",
4
+ "<end_of_turn>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
aratako_tts/t5gemma-tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7794135caa3ea73918949c902a781cc61dab674a4b59c17d85931c77c1114cbd
3
+ size 34362429
aratako_tts/t5gemma-tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
3
+ size 4241003
aratako_tts/t5gemma-tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,2014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "5": {
46
+ "content": "<2mass>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "6": {
54
+ "content": "[@BOS@]",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "7": {
62
+ "content": "<unused0>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "8": {
70
+ "content": "<unused1>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "9": {
78
+ "content": "<unused2>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "10": {
86
+ "content": "<unused3>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "11": {
94
+ "content": "<unused4>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "12": {
102
+ "content": "<unused5>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "13": {
110
+ "content": "<unused6>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "14": {
118
+ "content": "<unused7>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "15": {
126
+ "content": "<unused8>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "16": {
134
+ "content": "<unused9>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "17": {
142
+ "content": "<unused10>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "18": {
150
+ "content": "<unused11>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "19": {
158
+ "content": "<unused12>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "20": {
166
+ "content": "<unused13>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "21": {
174
+ "content": "<unused14>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "22": {
182
+ "content": "<unused15>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "23": {
190
+ "content": "<unused16>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "24": {
198
+ "content": "<unused17>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "25": {
206
+ "content": "<unused18>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "26": {
214
+ "content": "<unused19>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "27": {
222
+ "content": "<unused20>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "28": {
230
+ "content": "<unused21>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "29": {
238
+ "content": "<unused22>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "30": {
246
+ "content": "<unused23>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "31": {
254
+ "content": "<unused24>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "32": {
262
+ "content": "<unused25>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "33": {
270
+ "content": "<unused26>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "34": {
278
+ "content": "<unused27>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "35": {
286
+ "content": "<unused28>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "36": {
294
+ "content": "<unused29>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "37": {
302
+ "content": "<unused30>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "38": {
310
+ "content": "<unused31>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "39": {
318
+ "content": "<unused32>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": false
324
+ },
325
+ "40": {
326
+ "content": "<unused33>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": false
332
+ },
333
+ "41": {
334
+ "content": "<unused34>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": false
340
+ },
341
+ "42": {
342
+ "content": "<unused35>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": false
348
+ },
349
+ "43": {
350
+ "content": "<unused36>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": false
356
+ },
357
+ "44": {
358
+ "content": "<unused37>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": false
364
+ },
365
+ "45": {
366
+ "content": "<unused38>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": false
372
+ },
373
+ "46": {
374
+ "content": "<unused39>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": false
380
+ },
381
+ "47": {
382
+ "content": "<unused40>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": false
388
+ },
389
+ "48": {
390
+ "content": "<unused41>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": false
396
+ },
397
+ "49": {
398
+ "content": "<unused42>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": false
404
+ },
405
+ "50": {
406
+ "content": "<unused43>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": false
412
+ },
413
+ "51": {
414
+ "content": "<unused44>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": false
420
+ },
421
+ "52": {
422
+ "content": "<unused45>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": false
428
+ },
429
+ "53": {
430
+ "content": "<unused46>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": false
436
+ },
437
+ "54": {
438
+ "content": "<unused47>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": false
444
+ },
445
+ "55": {
446
+ "content": "<unused48>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": false
452
+ },
453
+ "56": {
454
+ "content": "<unused49>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": false
460
+ },
461
+ "57": {
462
+ "content": "<unused50>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": false
468
+ },
469
+ "58": {
470
+ "content": "<unused51>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": false
476
+ },
477
+ "59": {
478
+ "content": "<unused52>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": false
484
+ },
485
+ "60": {
486
+ "content": "<unused53>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": false
492
+ },
493
+ "61": {
494
+ "content": "<unused54>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "62": {
502
+ "content": "<unused55>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ },
509
+ "63": {
510
+ "content": "<unused56>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": false
516
+ },
517
+ "64": {
518
+ "content": "<unused57>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": false
524
+ },
525
+ "65": {
526
+ "content": "<unused58>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": false
532
+ },
533
+ "66": {
534
+ "content": "<unused59>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": false
540
+ },
541
+ "67": {
542
+ "content": "<unused60>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": false
548
+ },
549
+ "68": {
550
+ "content": "<unused61>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": false
556
+ },
557
+ "69": {
558
+ "content": "<unused62>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": false
564
+ },
565
+ "70": {
566
+ "content": "<unused63>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": false
572
+ },
573
+ "71": {
574
+ "content": "<unused64>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": false
580
+ },
581
+ "72": {
582
+ "content": "<unused65>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": false
588
+ },
589
+ "73": {
590
+ "content": "<unused66>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": false
596
+ },
597
+ "74": {
598
+ "content": "<unused67>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": false
604
+ },
605
+ "75": {
606
+ "content": "<unused68>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": false
612
+ },
613
+ "76": {
614
+ "content": "<unused69>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": false
620
+ },
621
+ "77": {
622
+ "content": "<unused70>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": false
628
+ },
629
+ "78": {
630
+ "content": "<unused71>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": false
636
+ },
637
+ "79": {
638
+ "content": "<unused72>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": false
644
+ },
645
+ "80": {
646
+ "content": "<unused73>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": false
652
+ },
653
+ "81": {
654
+ "content": "<unused74>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": false
660
+ },
661
+ "82": {
662
+ "content": "<unused75>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": false
668
+ },
669
+ "83": {
670
+ "content": "<unused76>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": false
676
+ },
677
+ "84": {
678
+ "content": "<unused77>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": false
684
+ },
685
+ "85": {
686
+ "content": "<unused78>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": false
692
+ },
693
+ "86": {
694
+ "content": "<unused79>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": false
700
+ },
701
+ "87": {
702
+ "content": "<unused80>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": false
708
+ },
709
+ "88": {
710
+ "content": "<unused81>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": false
716
+ },
717
+ "89": {
718
+ "content": "<unused82>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": false
724
+ },
725
+ "90": {
726
+ "content": "<unused83>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": false
732
+ },
733
+ "91": {
734
+ "content": "<unused84>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": false
740
+ },
741
+ "92": {
742
+ "content": "<unused85>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": false
748
+ },
749
+ "93": {
750
+ "content": "<unused86>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": false
756
+ },
757
+ "94": {
758
+ "content": "<unused87>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": false
764
+ },
765
+ "95": {
766
+ "content": "<unused88>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": false
772
+ },
773
+ "96": {
774
+ "content": "<unused89>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": false
780
+ },
781
+ "97": {
782
+ "content": "<unused90>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": false
788
+ },
789
+ "98": {
790
+ "content": "<unused91>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": false
796
+ },
797
+ "99": {
798
+ "content": "<unused92>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": false
804
+ },
805
+ "100": {
806
+ "content": "<unused93>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": false
812
+ },
813
+ "101": {
814
+ "content": "<unused94>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": false
820
+ },
821
+ "102": {
822
+ "content": "<unused95>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": false
828
+ },
829
+ "103": {
830
+ "content": "<unused96>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": false
836
+ },
837
+ "104": {
838
+ "content": "<unused97>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": false
844
+ },
845
+ "105": {
846
+ "content": "<unused98>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": false
852
+ },
853
+ "106": {
854
+ "content": "<start_of_turn>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "107": {
862
+ "content": "<end_of_turn>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "108": {
870
+ "content": "\n",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": false
876
+ },
877
+ "109": {
878
+ "content": "\n\n",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": false
884
+ },
885
+ "110": {
886
+ "content": "\n\n\n",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": false
892
+ },
893
+ "111": {
894
+ "content": "\n\n\n\n",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": false
900
+ },
901
+ "112": {
902
+ "content": "\n\n\n\n\n",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": false
908
+ },
909
+ "113": {
910
+ "content": "\n\n\n\n\n\n",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": false
916
+ },
917
+ "114": {
918
+ "content": "\n\n\n\n\n\n\n",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": false
924
+ },
925
+ "115": {
926
+ "content": "\n\n\n\n\n\n\n\n",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": false
932
+ },
933
+ "116": {
934
+ "content": "\n\n\n\n\n\n\n\n\n",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": false
940
+ },
941
+ "117": {
942
+ "content": "\n\n\n\n\n\n\n\n\n\n",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": false
948
+ },
949
+ "118": {
950
+ "content": "\n\n\n\n\n\n\n\n\n\n\n",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": false
956
+ },
957
+ "119": {
958
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": false
964
+ },
965
+ "120": {
966
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": false
972
+ },
973
+ "121": {
974
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": false
980
+ },
981
+ "122": {
982
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": false
988
+ },
989
+ "123": {
990
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": false
996
+ },
997
+ "124": {
998
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": false
1004
+ },
1005
+ "125": {
1006
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": false
1012
+ },
1013
+ "126": {
1014
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1015
+ "lstrip": false,
1016
+ "normalized": false,
1017
+ "rstrip": false,
1018
+ "single_word": false,
1019
+ "special": false
1020
+ },
1021
+ "127": {
1022
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1023
+ "lstrip": false,
1024
+ "normalized": false,
1025
+ "rstrip": false,
1026
+ "single_word": false,
1027
+ "special": false
1028
+ },
1029
+ "128": {
1030
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1031
+ "lstrip": false,
1032
+ "normalized": false,
1033
+ "rstrip": false,
1034
+ "single_word": false,
1035
+ "special": false
1036
+ },
1037
+ "129": {
1038
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1039
+ "lstrip": false,
1040
+ "normalized": false,
1041
+ "rstrip": false,
1042
+ "single_word": false,
1043
+ "special": false
1044
+ },
1045
+ "130": {
1046
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1047
+ "lstrip": false,
1048
+ "normalized": false,
1049
+ "rstrip": false,
1050
+ "single_word": false,
1051
+ "special": false
1052
+ },
1053
+ "131": {
1054
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1055
+ "lstrip": false,
1056
+ "normalized": false,
1057
+ "rstrip": false,
1058
+ "single_word": false,
1059
+ "special": false
1060
+ },
1061
+ "132": {
1062
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1063
+ "lstrip": false,
1064
+ "normalized": false,
1065
+ "rstrip": false,
1066
+ "single_word": false,
1067
+ "special": false
1068
+ },
1069
+ "133": {
1070
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1071
+ "lstrip": false,
1072
+ "normalized": false,
1073
+ "rstrip": false,
1074
+ "single_word": false,
1075
+ "special": false
1076
+ },
1077
+ "134": {
1078
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1079
+ "lstrip": false,
1080
+ "normalized": false,
1081
+ "rstrip": false,
1082
+ "single_word": false,
1083
+ "special": false
1084
+ },
1085
+ "135": {
1086
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1087
+ "lstrip": false,
1088
+ "normalized": false,
1089
+ "rstrip": false,
1090
+ "single_word": false,
1091
+ "special": false
1092
+ },
1093
+ "136": {
1094
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1095
+ "lstrip": false,
1096
+ "normalized": false,
1097
+ "rstrip": false,
1098
+ "single_word": false,
1099
+ "special": false
1100
+ },
1101
+ "137": {
1102
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1103
+ "lstrip": false,
1104
+ "normalized": false,
1105
+ "rstrip": false,
1106
+ "single_word": false,
1107
+ "special": false
1108
+ },
1109
+ "138": {
1110
+ "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
1111
+ "lstrip": false,
1112
+ "normalized": false,
1113
+ "rstrip": false,
1114
+ "single_word": false,
1115
+ "special": false
1116
+ },
1117
+ "139": {
1118
+ "content": "▁▁",
1119
+ "lstrip": false,
1120
+ "normalized": false,
1121
+ "rstrip": false,
1122
+ "single_word": false,
1123
+ "special": false
1124
+ },
1125
+ "140": {
1126
+ "content": "▁▁▁",
1127
+ "lstrip": false,
1128
+ "normalized": false,
1129
+ "rstrip": false,
1130
+ "single_word": false,
1131
+ "special": false
1132
+ },
1133
+ "141": {
1134
+ "content": "▁▁▁▁",
1135
+ "lstrip": false,
1136
+ "normalized": false,
1137
+ "rstrip": false,
1138
+ "single_word": false,
1139
+ "special": false
1140
+ },
1141
+ "142": {
1142
+ "content": "▁▁▁▁▁",
1143
+ "lstrip": false,
1144
+ "normalized": false,
1145
+ "rstrip": false,
1146
+ "single_word": false,
1147
+ "special": false
1148
+ },
1149
+ "143": {
1150
+ "content": "▁▁▁▁▁▁",
1151
+ "lstrip": false,
1152
+ "normalized": false,
1153
+ "rstrip": false,
1154
+ "single_word": false,
1155
+ "special": false
1156
+ },
1157
+ "144": {
1158
+ "content": "▁▁▁▁▁▁▁",
1159
+ "lstrip": false,
1160
+ "normalized": false,
1161
+ "rstrip": false,
1162
+ "single_word": false,
1163
+ "special": false
1164
+ },
1165
+ "145": {
1166
+ "content": "▁▁▁▁▁▁▁▁",
1167
+ "lstrip": false,
1168
+ "normalized": false,
1169
+ "rstrip": false,
1170
+ "single_word": false,
1171
+ "special": false
1172
+ },
1173
+ "146": {
1174
+ "content": "▁▁▁▁▁▁▁▁▁",
1175
+ "lstrip": false,
1176
+ "normalized": false,
1177
+ "rstrip": false,
1178
+ "single_word": false,
1179
+ "special": false
1180
+ },
1181
+ "147": {
1182
+ "content": "▁▁▁▁▁▁▁▁▁▁",
1183
+ "lstrip": false,
1184
+ "normalized": false,
1185
+ "rstrip": false,
1186
+ "single_word": false,
1187
+ "special": false
1188
+ },
1189
+ "148": {
1190
+ "content": "▁▁▁▁▁▁▁▁▁▁▁",
1191
+ "lstrip": false,
1192
+ "normalized": false,
1193
+ "rstrip": false,
1194
+ "single_word": false,
1195
+ "special": false
1196
+ },
1197
+ "149": {
1198
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁",
1199
+ "lstrip": false,
1200
+ "normalized": false,
1201
+ "rstrip": false,
1202
+ "single_word": false,
1203
+ "special": false
1204
+ },
1205
+ "150": {
1206
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁",
1207
+ "lstrip": false,
1208
+ "normalized": false,
1209
+ "rstrip": false,
1210
+ "single_word": false,
1211
+ "special": false
1212
+ },
1213
+ "151": {
1214
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1215
+ "lstrip": false,
1216
+ "normalized": false,
1217
+ "rstrip": false,
1218
+ "single_word": false,
1219
+ "special": false
1220
+ },
1221
+ "152": {
1222
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1223
+ "lstrip": false,
1224
+ "normalized": false,
1225
+ "rstrip": false,
1226
+ "single_word": false,
1227
+ "special": false
1228
+ },
1229
+ "153": {
1230
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1231
+ "lstrip": false,
1232
+ "normalized": false,
1233
+ "rstrip": false,
1234
+ "single_word": false,
1235
+ "special": false
1236
+ },
1237
+ "154": {
1238
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1239
+ "lstrip": false,
1240
+ "normalized": false,
1241
+ "rstrip": false,
1242
+ "single_word": false,
1243
+ "special": false
1244
+ },
1245
+ "155": {
1246
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1247
+ "lstrip": false,
1248
+ "normalized": false,
1249
+ "rstrip": false,
1250
+ "single_word": false,
1251
+ "special": false
1252
+ },
1253
+ "156": {
1254
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1255
+ "lstrip": false,
1256
+ "normalized": false,
1257
+ "rstrip": false,
1258
+ "single_word": false,
1259
+ "special": false
1260
+ },
1261
+ "157": {
1262
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1263
+ "lstrip": false,
1264
+ "normalized": false,
1265
+ "rstrip": false,
1266
+ "single_word": false,
1267
+ "special": false
1268
+ },
1269
+ "158": {
1270
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1271
+ "lstrip": false,
1272
+ "normalized": false,
1273
+ "rstrip": false,
1274
+ "single_word": false,
1275
+ "special": false
1276
+ },
1277
+ "159": {
1278
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1279
+ "lstrip": false,
1280
+ "normalized": false,
1281
+ "rstrip": false,
1282
+ "single_word": false,
1283
+ "special": false
1284
+ },
1285
+ "160": {
1286
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1287
+ "lstrip": false,
1288
+ "normalized": false,
1289
+ "rstrip": false,
1290
+ "single_word": false,
1291
+ "special": false
1292
+ },
1293
+ "161": {
1294
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1295
+ "lstrip": false,
1296
+ "normalized": false,
1297
+ "rstrip": false,
1298
+ "single_word": false,
1299
+ "special": false
1300
+ },
1301
+ "162": {
1302
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1303
+ "lstrip": false,
1304
+ "normalized": false,
1305
+ "rstrip": false,
1306
+ "single_word": false,
1307
+ "special": false
1308
+ },
1309
+ "163": {
1310
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1311
+ "lstrip": false,
1312
+ "normalized": false,
1313
+ "rstrip": false,
1314
+ "single_word": false,
1315
+ "special": false
1316
+ },
1317
+ "164": {
1318
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1319
+ "lstrip": false,
1320
+ "normalized": false,
1321
+ "rstrip": false,
1322
+ "single_word": false,
1323
+ "special": false
1324
+ },
1325
+ "165": {
1326
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1327
+ "lstrip": false,
1328
+ "normalized": false,
1329
+ "rstrip": false,
1330
+ "single_word": false,
1331
+ "special": false
1332
+ },
1333
+ "166": {
1334
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1335
+ "lstrip": false,
1336
+ "normalized": false,
1337
+ "rstrip": false,
1338
+ "single_word": false,
1339
+ "special": false
1340
+ },
1341
+ "167": {
1342
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1343
+ "lstrip": false,
1344
+ "normalized": false,
1345
+ "rstrip": false,
1346
+ "single_word": false,
1347
+ "special": false
1348
+ },
1349
+ "168": {
1350
+ "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
1351
+ "lstrip": false,
1352
+ "normalized": false,
1353
+ "rstrip": false,
1354
+ "single_word": false,
1355
+ "special": false
1356
+ },
1357
+ "169": {
1358
+ "content": "<table>",
1359
+ "lstrip": false,
1360
+ "normalized": false,
1361
+ "rstrip": false,
1362
+ "single_word": false,
1363
+ "special": false
1364
+ },
1365
+ "170": {
1366
+ "content": "<caption>",
1367
+ "lstrip": false,
1368
+ "normalized": false,
1369
+ "rstrip": false,
1370
+ "single_word": false,
1371
+ "special": false
1372
+ },
1373
+ "171": {
1374
+ "content": "<thead>",
1375
+ "lstrip": false,
1376
+ "normalized": false,
1377
+ "rstrip": false,
1378
+ "single_word": false,
1379
+ "special": false
1380
+ },
1381
+ "172": {
1382
+ "content": "<tbody>",
1383
+ "lstrip": false,
1384
+ "normalized": false,
1385
+ "rstrip": false,
1386
+ "single_word": false,
1387
+ "special": false
1388
+ },
1389
+ "173": {
1390
+ "content": "<tfoot>",
1391
+ "lstrip": false,
1392
+ "normalized": false,
1393
+ "rstrip": false,
1394
+ "single_word": false,
1395
+ "special": false
1396
+ },
1397
+ "174": {
1398
+ "content": "<tr>",
1399
+ "lstrip": false,
1400
+ "normalized": false,
1401
+ "rstrip": false,
1402
+ "single_word": false,
1403
+ "special": false
1404
+ },
1405
+ "175": {
1406
+ "content": "<th>",
1407
+ "lstrip": false,
1408
+ "normalized": false,
1409
+ "rstrip": false,
1410
+ "single_word": false,
1411
+ "special": false
1412
+ },
1413
+ "176": {
1414
+ "content": "<td>",
1415
+ "lstrip": false,
1416
+ "normalized": false,
1417
+ "rstrip": false,
1418
+ "single_word": false,
1419
+ "special": false
1420
+ },
1421
+ "177": {
1422
+ "content": "</table>",
1423
+ "lstrip": false,
1424
+ "normalized": false,
1425
+ "rstrip": false,
1426
+ "single_word": false,
1427
+ "special": false
1428
+ },
1429
+ "178": {
1430
+ "content": "</caption>",
1431
+ "lstrip": false,
1432
+ "normalized": false,
1433
+ "rstrip": false,
1434
+ "single_word": false,
1435
+ "special": false
1436
+ },
1437
+ "179": {
1438
+ "content": "</thead>",
1439
+ "lstrip": false,
1440
+ "normalized": false,
1441
+ "rstrip": false,
1442
+ "single_word": false,
1443
+ "special": false
1444
+ },
1445
+ "180": {
1446
+ "content": "</tbody>",
1447
+ "lstrip": false,
1448
+ "normalized": false,
1449
+ "rstrip": false,
1450
+ "single_word": false,
1451
+ "special": false
1452
+ },
1453
+ "181": {
1454
+ "content": "</tfoot>",
1455
+ "lstrip": false,
1456
+ "normalized": false,
1457
+ "rstrip": false,
1458
+ "single_word": false,
1459
+ "special": false
1460
+ },
1461
+ "182": {
1462
+ "content": "</tr>",
1463
+ "lstrip": false,
1464
+ "normalized": false,
1465
+ "rstrip": false,
1466
+ "single_word": false,
1467
+ "special": false
1468
+ },
1469
+ "183": {
1470
+ "content": "</th>",
1471
+ "lstrip": false,
1472
+ "normalized": false,
1473
+ "rstrip": false,
1474
+ "single_word": false,
1475
+ "special": false
1476
+ },
1477
+ "184": {
1478
+ "content": "</td>",
1479
+ "lstrip": false,
1480
+ "normalized": false,
1481
+ "rstrip": false,
1482
+ "single_word": false,
1483
+ "special": false
1484
+ },
1485
+ "185": {
1486
+ "content": "<h1>",
1487
+ "lstrip": false,
1488
+ "normalized": false,
1489
+ "rstrip": false,
1490
+ "single_word": false,
1491
+ "special": false
1492
+ },
1493
+ "186": {
1494
+ "content": "<h2>",
1495
+ "lstrip": false,
1496
+ "normalized": false,
1497
+ "rstrip": false,
1498
+ "single_word": false,
1499
+ "special": false
1500
+ },
1501
+ "187": {
1502
+ "content": "<h3>",
1503
+ "lstrip": false,
1504
+ "normalized": false,
1505
+ "rstrip": false,
1506
+ "single_word": false,
1507
+ "special": false
1508
+ },
1509
+ "188": {
1510
+ "content": "<h4>",
1511
+ "lstrip": false,
1512
+ "normalized": false,
1513
+ "rstrip": false,
1514
+ "single_word": false,
1515
+ "special": false
1516
+ },
1517
+ "189": {
1518
+ "content": "<h5>",
1519
+ "lstrip": false,
1520
+ "normalized": false,
1521
+ "rstrip": false,
1522
+ "single_word": false,
1523
+ "special": false
1524
+ },
1525
+ "190": {
1526
+ "content": "<h6>",
1527
+ "lstrip": false,
1528
+ "normalized": false,
1529
+ "rstrip": false,
1530
+ "single_word": false,
1531
+ "special": false
1532
+ },
1533
+ "191": {
1534
+ "content": "<blockquote>",
1535
+ "lstrip": false,
1536
+ "normalized": false,
1537
+ "rstrip": false,
1538
+ "single_word": false,
1539
+ "special": false
1540
+ },
1541
+ "192": {
1542
+ "content": "</h1>",
1543
+ "lstrip": false,
1544
+ "normalized": false,
1545
+ "rstrip": false,
1546
+ "single_word": false,
1547
+ "special": false
1548
+ },
1549
+ "193": {
1550
+ "content": "</h2>",
1551
+ "lstrip": false,
1552
+ "normalized": false,
1553
+ "rstrip": false,
1554
+ "single_word": false,
1555
+ "special": false
1556
+ },
1557
+ "194": {
1558
+ "content": "</h3>",
1559
+ "lstrip": false,
1560
+ "normalized": false,
1561
+ "rstrip": false,
1562
+ "single_word": false,
1563
+ "special": false
1564
+ },
1565
+ "195": {
1566
+ "content": "</h4>",
1567
+ "lstrip": false,
1568
+ "normalized": false,
1569
+ "rstrip": false,
1570
+ "single_word": false,
1571
+ "special": false
1572
+ },
1573
+ "196": {
1574
+ "content": "</h5>",
1575
+ "lstrip": false,
1576
+ "normalized": false,
1577
+ "rstrip": false,
1578
+ "single_word": false,
1579
+ "special": false
1580
+ },
1581
+ "197": {
1582
+ "content": "</h6>",
1583
+ "lstrip": false,
1584
+ "normalized": false,
1585
+ "rstrip": false,
1586
+ "single_word": false,
1587
+ "special": false
1588
+ },
1589
+ "198": {
1590
+ "content": "</blockquote>",
1591
+ "lstrip": false,
1592
+ "normalized": false,
1593
+ "rstrip": false,
1594
+ "single_word": false,
1595
+ "special": false
1596
+ },
1597
+ "199": {
1598
+ "content": "<strong>",
1599
+ "lstrip": false,
1600
+ "normalized": false,
1601
+ "rstrip": false,
1602
+ "single_word": false,
1603
+ "special": false
1604
+ },
1605
+ "200": {
1606
+ "content": "<em>",
1607
+ "lstrip": false,
1608
+ "normalized": false,
1609
+ "rstrip": false,
1610
+ "single_word": false,
1611
+ "special": false
1612
+ },
1613
+ "201": {
1614
+ "content": "<b>",
1615
+ "lstrip": false,
1616
+ "normalized": false,
1617
+ "rstrip": false,
1618
+ "single_word": false,
1619
+ "special": false
1620
+ },
1621
+ "202": {
1622
+ "content": "<i>",
1623
+ "lstrip": false,
1624
+ "normalized": false,
1625
+ "rstrip": false,
1626
+ "single_word": false,
1627
+ "special": false
1628
+ },
1629
+ "203": {
1630
+ "content": "<u>",
1631
+ "lstrip": false,
1632
+ "normalized": false,
1633
+ "rstrip": false,
1634
+ "single_word": false,
1635
+ "special": false
1636
+ },
1637
+ "204": {
1638
+ "content": "<s>",
1639
+ "lstrip": false,
1640
+ "normalized": false,
1641
+ "rstrip": false,
1642
+ "single_word": false,
1643
+ "special": false
1644
+ },
1645
+ "205": {
1646
+ "content": "<sub>",
1647
+ "lstrip": false,
1648
+ "normalized": false,
1649
+ "rstrip": false,
1650
+ "single_word": false,
1651
+ "special": false
1652
+ },
1653
+ "206": {
1654
+ "content": "<sup>",
1655
+ "lstrip": false,
1656
+ "normalized": false,
1657
+ "rstrip": false,
1658
+ "single_word": false,
1659
+ "special": false
1660
+ },
1661
+ "207": {
1662
+ "content": "<code>",
1663
+ "lstrip": false,
1664
+ "normalized": false,
1665
+ "rstrip": false,
1666
+ "single_word": false,
1667
+ "special": false
1668
+ },
1669
+ "208": {
1670
+ "content": "</strong>",
1671
+ "lstrip": false,
1672
+ "normalized": false,
1673
+ "rstrip": false,
1674
+ "single_word": false,
1675
+ "special": false
1676
+ },
1677
+ "209": {
1678
+ "content": "</em>",
1679
+ "lstrip": false,
1680
+ "normalized": false,
1681
+ "rstrip": false,
1682
+ "single_word": false,
1683
+ "special": false
1684
+ },
1685
+ "210": {
1686
+ "content": "</b>",
1687
+ "lstrip": false,
1688
+ "normalized": false,
1689
+ "rstrip": false,
1690
+ "single_word": false,
1691
+ "special": false
1692
+ },
1693
+ "211": {
1694
+ "content": "</i>",
1695
+ "lstrip": false,
1696
+ "normalized": false,
1697
+ "rstrip": false,
1698
+ "single_word": false,
1699
+ "special": false
1700
+ },
1701
+ "212": {
1702
+ "content": "</u>",
1703
+ "lstrip": false,
1704
+ "normalized": false,
1705
+ "rstrip": false,
1706
+ "single_word": false,
1707
+ "special": false
1708
+ },
1709
+ "213": {
1710
+ "content": "</s>",
1711
+ "lstrip": false,
1712
+ "normalized": false,
1713
+ "rstrip": false,
1714
+ "single_word": false,
1715
+ "special": false
1716
+ },
1717
+ "214": {
1718
+ "content": "</sub>",
1719
+ "lstrip": false,
1720
+ "normalized": false,
1721
+ "rstrip": false,
1722
+ "single_word": false,
1723
+ "special": false
1724
+ },
1725
+ "215": {
1726
+ "content": "</sup>",
1727
+ "lstrip": false,
1728
+ "normalized": false,
1729
+ "rstrip": false,
1730
+ "single_word": false,
1731
+ "special": false
1732
+ },
1733
+ "216": {
1734
+ "content": "</code>",
1735
+ "lstrip": false,
1736
+ "normalized": false,
1737
+ "rstrip": false,
1738
+ "single_word": false,
1739
+ "special": false
1740
+ },
1741
+ "255968": {
1742
+ "content": "[toxicity=0]",
1743
+ "lstrip": false,
1744
+ "normalized": false,
1745
+ "rstrip": false,
1746
+ "single_word": false,
1747
+ "special": false
1748
+ },
1749
+ "255969": {
1750
+ "content": "\t\t",
1751
+ "lstrip": false,
1752
+ "normalized": false,
1753
+ "rstrip": false,
1754
+ "single_word": false,
1755
+ "special": false
1756
+ },
1757
+ "255970": {
1758
+ "content": "\t\t\t",
1759
+ "lstrip": false,
1760
+ "normalized": false,
1761
+ "rstrip": false,
1762
+ "single_word": false,
1763
+ "special": false
1764
+ },
1765
+ "255971": {
1766
+ "content": "\t\t\t\t",
1767
+ "lstrip": false,
1768
+ "normalized": false,
1769
+ "rstrip": false,
1770
+ "single_word": false,
1771
+ "special": false
1772
+ },
1773
+ "255972": {
1774
+ "content": "\t\t\t\t\t",
1775
+ "lstrip": false,
1776
+ "normalized": false,
1777
+ "rstrip": false,
1778
+ "single_word": false,
1779
+ "special": false
1780
+ },
1781
+ "255973": {
1782
+ "content": "\t\t\t\t\t\t",
1783
+ "lstrip": false,
1784
+ "normalized": false,
1785
+ "rstrip": false,
1786
+ "single_word": false,
1787
+ "special": false
1788
+ },
1789
+ "255974": {
1790
+ "content": "\t\t\t\t\t\t\t",
1791
+ "lstrip": false,
1792
+ "normalized": false,
1793
+ "rstrip": false,
1794
+ "single_word": false,
1795
+ "special": false
1796
+ },
1797
+ "255975": {
1798
+ "content": "\t\t\t\t\t\t\t\t",
1799
+ "lstrip": false,
1800
+ "normalized": false,
1801
+ "rstrip": false,
1802
+ "single_word": false,
1803
+ "special": false
1804
+ },
1805
+ "255976": {
1806
+ "content": "\t\t\t\t\t\t\t\t\t",
1807
+ "lstrip": false,
1808
+ "normalized": false,
1809
+ "rstrip": false,
1810
+ "single_word": false,
1811
+ "special": false
1812
+ },
1813
+ "255977": {
1814
+ "content": "\t\t\t\t\t\t\t\t\t\t",
1815
+ "lstrip": false,
1816
+ "normalized": false,
1817
+ "rstrip": false,
1818
+ "single_word": false,
1819
+ "special": false
1820
+ },
1821
+ "255978": {
1822
+ "content": "\t\t\t\t\t\t\t\t\t\t\t",
1823
+ "lstrip": false,
1824
+ "normalized": false,
1825
+ "rstrip": false,
1826
+ "single_word": false,
1827
+ "special": false
1828
+ },
1829
+ "255979": {
1830
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t",
1831
+ "lstrip": false,
1832
+ "normalized": false,
1833
+ "rstrip": false,
1834
+ "single_word": false,
1835
+ "special": false
1836
+ },
1837
+ "255980": {
1838
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t",
1839
+ "lstrip": false,
1840
+ "normalized": false,
1841
+ "rstrip": false,
1842
+ "single_word": false,
1843
+ "special": false
1844
+ },
1845
+ "255981": {
1846
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1847
+ "lstrip": false,
1848
+ "normalized": false,
1849
+ "rstrip": false,
1850
+ "single_word": false,
1851
+ "special": false
1852
+ },
1853
+ "255982": {
1854
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1855
+ "lstrip": false,
1856
+ "normalized": false,
1857
+ "rstrip": false,
1858
+ "single_word": false,
1859
+ "special": false
1860
+ },
1861
+ "255983": {
1862
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1863
+ "lstrip": false,
1864
+ "normalized": false,
1865
+ "rstrip": false,
1866
+ "single_word": false,
1867
+ "special": false
1868
+ },
1869
+ "255984": {
1870
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1871
+ "lstrip": false,
1872
+ "normalized": false,
1873
+ "rstrip": false,
1874
+ "single_word": false,
1875
+ "special": false
1876
+ },
1877
+ "255985": {
1878
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1879
+ "lstrip": false,
1880
+ "normalized": false,
1881
+ "rstrip": false,
1882
+ "single_word": false,
1883
+ "special": false
1884
+ },
1885
+ "255986": {
1886
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1887
+ "lstrip": false,
1888
+ "normalized": false,
1889
+ "rstrip": false,
1890
+ "single_word": false,
1891
+ "special": false
1892
+ },
1893
+ "255987": {
1894
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1895
+ "lstrip": false,
1896
+ "normalized": false,
1897
+ "rstrip": false,
1898
+ "single_word": false,
1899
+ "special": false
1900
+ },
1901
+ "255988": {
1902
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1903
+ "lstrip": false,
1904
+ "normalized": false,
1905
+ "rstrip": false,
1906
+ "single_word": false,
1907
+ "special": false
1908
+ },
1909
+ "255989": {
1910
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1911
+ "lstrip": false,
1912
+ "normalized": false,
1913
+ "rstrip": false,
1914
+ "single_word": false,
1915
+ "special": false
1916
+ },
1917
+ "255990": {
1918
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1919
+ "lstrip": false,
1920
+ "normalized": false,
1921
+ "rstrip": false,
1922
+ "single_word": false,
1923
+ "special": false
1924
+ },
1925
+ "255991": {
1926
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1927
+ "lstrip": false,
1928
+ "normalized": false,
1929
+ "rstrip": false,
1930
+ "single_word": false,
1931
+ "special": false
1932
+ },
1933
+ "255992": {
1934
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1935
+ "lstrip": false,
1936
+ "normalized": false,
1937
+ "rstrip": false,
1938
+ "single_word": false,
1939
+ "special": false
1940
+ },
1941
+ "255993": {
1942
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1943
+ "lstrip": false,
1944
+ "normalized": false,
1945
+ "rstrip": false,
1946
+ "single_word": false,
1947
+ "special": false
1948
+ },
1949
+ "255994": {
1950
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1951
+ "lstrip": false,
1952
+ "normalized": false,
1953
+ "rstrip": false,
1954
+ "single_word": false,
1955
+ "special": false
1956
+ },
1957
+ "255995": {
1958
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1959
+ "lstrip": false,
1960
+ "normalized": false,
1961
+ "rstrip": false,
1962
+ "single_word": false,
1963
+ "special": false
1964
+ },
1965
+ "255996": {
1966
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1967
+ "lstrip": false,
1968
+ "normalized": false,
1969
+ "rstrip": false,
1970
+ "single_word": false,
1971
+ "special": false
1972
+ },
1973
+ "255997": {
1974
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1975
+ "lstrip": false,
1976
+ "normalized": false,
1977
+ "rstrip": false,
1978
+ "single_word": false,
1979
+ "special": false
1980
+ },
1981
+ "255998": {
1982
+ "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t",
1983
+ "lstrip": false,
1984
+ "normalized": false,
1985
+ "rstrip": false,
1986
+ "single_word": false,
1987
+ "special": false
1988
+ },
1989
+ "255999": {
1990
+ "content": "<unused99>",
1991
+ "lstrip": false,
1992
+ "normalized": false,
1993
+ "rstrip": false,
1994
+ "single_word": false,
1995
+ "special": false
1996
+ }
1997
+ },
1998
+ "additional_special_tokens": [
1999
+ "<start_of_turn>",
2000
+ "<end_of_turn>"
2001
+ ],
2002
+ "bos_token": "<bos>",
2003
+ "clean_up_tokenization_spaces": false,
2004
+ "eos_token": "<eos>",
2005
+ "extra_special_tokens": {},
2006
+ "model_max_length": 1000000000000000019884624838656,
2007
+ "pad_token": "<pad>",
2008
+ "padding_side": "right",
2009
+ "sp_model_kwargs": {},
2010
+ "spaces_between_special_tokens": false,
2011
+ "tokenizer_class": "GemmaTokenizer",
2012
+ "unk_token": "<unk>",
2013
+ "use_default_system_prompt": false
2014
+ }