ddyuudd commited on
Commit
d71af46
·
1 Parent(s): a124512

Upload CAT-Translate-7b model files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ CAT-logo.png filter=lfs diff=lfs merge=lfs -text
CAT-logo.png ADDED

Git LFS Details

  • SHA256: 0c6536ff0db067fd5975e8d8667293ac4de306de737e8a3465cabd47efcb5f11
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 CyberAgent AI Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,143 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CAT-Translate 🐱
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/cyberagent/CAT-Translate-7b/)
5
+
6
+ Tiny Language Model For Japanese and English Bidirectional Translation
7
+
8
+ - **Purrs on your lap** 🐱: Small and efficient! 0.8-7B models that run on edge devices.
9
+ - **Swift and Feline Sharp** 🐾: Beats TranslateGemma-12B on text-to-text translation quality.
10
+ - **Adopt and adapt** 🐈: Open source (MIT License) models you can customize and extend.
11
+
12
+ <div align="center">
13
+ <img src="CAT-logo.png" alt="Cat sleeping on top of a laptop." width="200">
14
+ </div>
15
+
16
+ ## Models
17
+
18
+ All models are available on Hugging Face:
19
+
20
+ - [CAT-Translate-0.8B](https://huggingface.co/cyberagent/CAT-Translate-0.8b/)
21
+ - [CAT-Translate-1.4B](https://huggingface.co/cyberagent/CAT-Translate-1.4b/)
22
+ - [CAT-Translate-3.3B](https://huggingface.co/cyberagent/CAT-Translate-3.3b/)
23
+ - [CAT-Translate-7B](https://huggingface.co/cyberagent/CAT-Translate-7b/)
24
+
25
+ ## Evaluation
26
+
27
+ We conducted an evaluation on the translation subsets of the following benchmarks:
28
+
29
+ - [The Business Scene Dialogue corpus](https://github.com/tsuruoka-lab/BSD) (BSD)
30
+ - Each conversation is given to the model to translate instead of each sentence.
31
+ - [Court Interpreter](https://github.com/mynlp/court_interpreter) (Court)
32
+ - [JMedBench](https://huggingface.co/datasets/Coldog2333/JMedBench) (JMed)
33
+ - ejmmt subsets are used.
34
+ - [pfmt-bench-fin-ja](https://github.com/pfnet-research/pfmt-bench-fin-ja) (PFMT)
35
+ - [WAT 2025 Patent Translation](https://sites.google.com/view/pat-claims-trans-2025/) (wat-pat-2025)
36
+
37
+ We chose these tasks as benchmarks because (1) they are derived from real-world applications and (2) they are less overoptimized compared to popular datasets (e.g., WMT).
38
+ The results are below.
39
+ All the models achieved the best scores among all models (including closed source) within their respective sizes for both En-Ja and Ja-En translation tasks.
40
+
41
+
42
+ | Model | Avg. BLEU | Avg. BLEU Ja->En | Avg. BLEU En->Ja | BSD (Ja-En) | Court (Ja-En) | JMed (Ja-En) | PFMT (Ja-En) | wat-pat-2025 (Ja-En) | BSD (En-Ja) | JMed (En-Ja) | PFMT (En-Ja) | wat-pat-2025 (En-Ja) |
43
+ |:-------------------------------------------------|----------:|-----------------:|-----------------:|------------:|--------------:|-------------:|-------------:|------------------:|------------:|-------------:|-------------:|------------------:|
44
+ | CyberAgent/CAT-Translate-7B | 37.68 | 41.06 | 34.31 | 33.75 | 45.29 | 30.65 | 49.86 | 45.74 | 16.29 | 29.62 | 52.94 | 38.37 |
45
+ | CyberAgent/CAT-Translate-3.3B | 36.16 | 37.51 | 34.80 | 26.51 | 42.44 | 24.47 | 49.93 | 44.23 | 17.21 | 28.67 | 53.88 | 39.44 |
46
+ | CyberAgent/CAT-Translate-1.4B | 33.73 | 33.26 | 34.19 | 31.28 | 43.84 | 24.08 | 36.55 | 30.57 | 15.71 | 26.92 | 51.53 | 42.58 |
47
+ | Unbabel/Tower-Plus-9B | 32.41 | 36.84 | 27.99 | 15.43 | 40.54 | 29.13 | 58.00 | 41.10 | 10.00 | 18.80 | 53.00 | 30.16 |
48
+ | google/translategemma-12b-it | 32.24 | 35.81 | 28.68 | 31.58 | 34.30 | 23.46 | 48.75 | 40.97 | 15.92 | 21.79 | 52.53 | 24.47 |
49
+ | CyberAgent/CAT-Translate-3.3B-beta | 30.60 | 30.32 | 30.88 | 17.20 | 38.65 | 23.96 | 40.58 | 31.22 | 16.63 | 26.68 | 53.40 | 26.80 |
50
+ | CyberAgent/CAT-Translate-0.8B | 30.42 | 29.71 | 30.68 | 29.63 | 33.19 | 22.96 | 32.51 | 30.56 | 14.60 | 26.22 | 50.62 | 32.87 |
51
+ | google/translategemma-4b-it | 28.09 | 29.41 | 26.76 | 28.86 | 25.89 | 21.50 | 42.65 | 28.16 | 14.14 | 20.68 | 51.99 | 20.23 |
52
+ | LiquidAI/LFM2.5-1.2B-JP | 25.47 | 24.51 | 26.43 | 19.06 | 29.99 | 22.10 | 43.61 | 7.80 | 14.57 | 23.85 | 54.77 | 12.54 |
53
+ | pfnet/plamo-2-translate | 25.24 | 25.92 | 24.57 | 25.55 | 28.63 | 22.90 | 29.02 | 23.48 | 17.35 | 24.98 | 32.04 | 23.89 |
54
+ | LiquidAI/LFM2-350M-ENJP-MT | 24.95 | 24.91 | 25.00 | 10.94 | 29.56 | 21.48 | 41.40 | 21.17 | 8.11 | 22.84 | 47.53 | 21.52 |
55
+ | mistralai/Ministral-8B-Instruct-2410 | 24.12 | 27.52 | 20.71 | 19.23 | 29.21 | 16.25 | 50.23 | 22.69 | 12.91 | 16.49 | 41.66 | 11.80 |
56
+ | nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese | 22.97 | 22.77 | 23.18 | 9.62 | 34.98 | 18.01 | 38.44 | 12.81 | 10.62 | 20.41 | 42.55 | 19.13 |
57
+ | Rakuten/RakutenAI-2.0-mini-instruct | 18.43 | 17.24 | 19.62 | 0.11 | 30.62 | 18.21 | 29.34 | 7.90 | 5.19 | 20.36 | 45.70 | 7.23 |
58
+ | SakanaAI/TinySwallow-1.5B-Instruct | 15.74 | 14.99 | 16.49 | 4.96 | 18.93 | 15.83 | 26.67 | 8.58 | 6.30 | 17.58 | 34.07 | 8.00 |
59
+ | llm-jp/llm-jp-3.1-1.8b-instruct4 | 15.18 | 16.26 | 14.11 | 18.82 | 2.44 | 15.67 | 30.65 | 13.72 | 15.38 | 4.91 | 25.47 | 10.65 |
60
+ | tencent/HY-MT1.5-1.8B | 14.49 | 8.95 | 20.04 | 5.50 | 4.59 | 4.00 | 15.67 | 14.98 | 6.33 | 18.13 | 37.75 | 17.96 |
61
+ | shisa-ai/shisa-v2.1-llama3.2-3b | 14.27 | 14.26 | 14.28 | 17.08 | 3.70 | 8.26 | 26.86 | 15.42 | 13.18 | 5.54 | 25.97 | 12.41 |
62
+ | google/gemma-2-2b-jpn-it | 14.15 | 16.98 | 11.32 | 20.04 | 8.08 | 11.27 | 31.49 | 14.01 | 12.37 | 4.48 | 16.24 | 12.21 |
63
+ | shisa-ai/shisa-v2.1-lfm2-1.2b | 13.08 | 14.02 | 12.14 | 20.93 | 4.95 | 7.68 | 26.72 | 9.80 | 12.11 | 5.54 | 17.60 | 13.30 |
64
+ | microsoft/phi-4 | 11.92 | 13.48 | 10.36 | 6.10 | 18.66 | 2.81 | 24.86 | 14.98 | 3.24 | 6.97 | 14.36 | 16.87 |
65
+ | tencent/HY-MT1.5-7B | 10.56 | 13.46 | 7.67 | 4.99 | 12.32 | 5.72 | 29.53 | 14.76 | 0.82 | 7.80 | 14.30 | 7.74 |
66
+ | tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.5 | 10.35 | 12.42 | 8.28 | 24.25 | 2.30 | 3.69 | 14.11 | 17.74 | 6.82 | 2.37 | 11.21 | 12.71 |
67
+ | Qwen/Qwen2.5-14B-Instruct | 8.39 | 9.88 | 6.89 | 10.81 | 4.70 | 4.27 | 11.18 | 18.46 | 4.01 | 3.69 | 13.42 | 6.42 |
68
+ | meta-llama/Llama-3.2-3B-Instruct | 6.06 | 9.90 | 2.23 | 18.60 | 0.41 | 2.72 | 16.62 | 11.17 | 1.44 | 1.10 | 4.50 | 1.87 |
69
+
70
+ A detailed experimental evaluation will be presented in a technical report.
71
+
72
+ ## Usage
73
+
74
+ The model supports English to Japanese and Japanese to English translation with the following prompt format:
75
+
76
+ ```python
77
+ from transformers import pipeline
78
+
79
+ # Load the model
80
+ chat_pipeline = pipeline("text-generation", model="CyberAgent/CAT-Translate-7b")
81
+
82
+ # Define the prompt template
83
+ prompt = "Translate the following {src_lang} text into {tgt_lang}.\n\n{src_text}"
84
+
85
+ # Example: Japanese to English
86
+ src_lang = "Japanese"
87
+ tgt_lang = "English"
88
+ src_text = "🐈はとてもかわいいの。おててがまるくてふわふわなの。"
89
+
90
+ user_input = [{"role": "user", "content": prompt.format(src_lang=src_lang, tgt_lang=tgt_lang, src_text=src_text)}]
91
+
92
+ response = chat_pipeline(user_input)
93
+
94
+ print("-" * 20)
95
+ print("Source Text:")
96
+ print(src_text)
97
+ print("Translation:")
98
+ print(response[0]['generated_text'][-1]['content'])
99
+ ```
100
+
101
+ **Important**: You need to apply the chat template to run the model correctly.
102
+
103
+ ### Why Use Instructions?
104
+
105
+ Although the model is specialized for machine translation, we require an instruction prompt to invoke the translation capability. This design choice provides better customizability—extending and merging this model is easier this way. Since the model is open source, any extensions are welcome!
106
+
107
+ ## Training
108
+
109
+ This 7B model is based on CyberAgent's in-house model developed by [Ryosuke Ishigami](https://huggingface.co/rishigami).
110
+
111
+ Our training process involved:
112
+ - Synthesizing datasets using large language models
113
+ - Multi-stage supervised fine-tuning (SFT) approach
114
+ - Reinforcement learning with [Multi-Objective GRPO (Ichihara et al. 2025)](https://arxiv.org/abs/2509.22047)
115
+
116
+ ## License
117
+
118
+ The model is licensed under the [MIT License](LICENSE).
119
+
120
+ ## Citation
121
+
122
+ ```bibtex
123
+ @misc{cat-translate-2026,
124
+ title={CAT-Translate: Tiny Language Model For Japanese and English Bidirectional Translation},
125
+ author={Yuu Jinnai},
126
+ year={2026},
127
+ url={https://huggingface.co/collections/cyberagent/cat-translate}
128
+ }
129
+ ```
130
+
131
+ ## Acknowledgments
132
+
133
+ This project stands on the shoulders of giants. In particular, the following resources significantly helped us develop the model:
134
+
135
+ - [Ryosuke Ishigami](https://huggingface.co/rishigami) for sharing the model
136
+ - [sarashina](https://huggingface.co/sbintuitions) by SB Intuitions
137
+ - [gpt-oss](https://huggingface.co/openai/gpt-oss-20b) by OpenAI
138
+ - [MetricX](https://huggingface.co/google/metricx-24-hybrid-xl-v2p6-bfloat16) by Juraj Juraska et al.
139
+ - [Duplodocus](https://github.com/allenai/duplodocus) by AllenAI
140
+ - [fastText](https://github.com/facebookresearch/fastText) by Facebook Research
141
+ - [COMET](https://huggingface.co/Unbabel/wmt22-comet-da) by Ricardo Rei et al.
142
+ - [sacrebleu](https://github.com/mjpost/sacrebleu) by Matt Post
143
+ - [Mitsuki Sakamoto](https://huggingface.co/Mitsuki-Sakamoto) for deploying the model with UI for internal testing
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<s><|im_start|>system
2
+ ' + system_message + '<|im_end|>
3
+ '}}{% endif %}{{'<|im_start|>' + message['role'] + '
4
+ ' + message['content'] + '<|im_end|>' + '
5
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
6
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MistralForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 1,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 61763,
9
+ "head_dim": null,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 14336,
14
+ "max_position_embeddings": 32768,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 32,
17
+ "num_hidden_layers": 32,
18
+ "num_key_value_heads": 8,
19
+ "pad_token_id": 61762,
20
+ "rms_norm_eps": 1e-05,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "transformers_version": "4.57.1",
25
+ "use_cache": false,
26
+ "vocab_size": 61765
27
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 61763
7
+ ],
8
+ "pad_token_id": 61762,
9
+ "transformers_version": "4.57.1"
10
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d5dc03f9286330fb276b2947049066ef19255e779377b849fba3b5ceeaaf2ed
3
+ size 4952115968
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf44461125ec468943e1b5f713f01acf296225f13b5586642fae2f57edb9a7ec
3
+ size 4915916184
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75c7f579a0c22b4099974e806bae682f95cbf10a55158546697b578fe8a81e48
3
+ size 4597156656
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d5ebd00f85365e75ad4fe8ba4535723530acd124ef20ecca07122dc9e74f4e2
3
+ size 505979008
model.safetensors.index.json ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 7485566976,
4
+ "total_size": 14971133952
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
28
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
35
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
36
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
143
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
217
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
244
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
245
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
246
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
247
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
248
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
249
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
250
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
252
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
253
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
254
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
255
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
256
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
257
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
258
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
291
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
294
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
296
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.norm.weight": "model-00003-of-00004.safetensors"
298
+ }
299
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|padding|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "<|im_start|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
test.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
+ model_name = "CyberAgent/CAT-Translate-7b"
4
+ chat_pipeline = pipeline("text-generation", model_name)
5
+
6
+ prompt = "Translate the following {src_lang} text into {tgt_lang}.\n\n {src_text}"
7
+
8
+ src_lang = "Japanese"
9
+ tgt_lang = "English"
10
+ src_text = "🐈はとてもかわいいの。おててがまるくてふわふわなの。"
11
+
12
+ user_input = [{"role": "user", "content": prompt.format(src_lang=src_lang, tgt_lang=tgt_lang, src_text=src_text)}]
13
+
14
+ response = chat_pipeline(user_input)
15
+
16
+ print("-" * 20)
17
+ print("Source Text:")
18
+ print(src_text)
19
+ print("Translation:")
20
+ print(response[0]['generated_text'][-1]['content'])
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "61762": {
31
+ "content": "<|padding|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "61763": {
39
+ "content": "<|im_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "61764": {
47
+ "content": "<|im_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ }
54
+ },
55
+ "additional_special_tokens": [],
56
+ "bos_token": "<s>",
57
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<s><|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
58
+ "clean_up_tokenization_spaces": false,
59
+ "eos_token": "<|im_end|>",
60
+ "extra_special_tokens": {},
61
+ "legacy": true,
62
+ "model_max_length": 1000000000000000019884624838656,
63
+ "pad_token": "<|padding|>",
64
+ "sep_token": "<|im_start|>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "LlamaTokenizerFast",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }