AIDSC

TimeRobber committed on Oct 23, 2024

Commit

45e3df8

verified ·

0 Parent(s):

Duplicate from bigscience/mt0-xxl

Browse files

Co-authored-by: Thomas Wang <TimeRobber@users.noreply.huggingface.co>

Files changed (21) hide show

.gitattributes +33 -0
README.md +960 -0
config.json +32 -0
model-00001-of-00006.safetensors +3 -0
model-00002-of-00006.safetensors +3 -0
model-00003-of-00006.safetensors +3 -0
model-00004-of-00006.safetensors +3 -0
model-00005-of-00006.safetensors +3 -0
model-00006-of-00006.safetensors +3 -0
model.safetensors.index.json +566 -0
pytorch_model-00001-of-00006.bin +3 -0
pytorch_model-00002-of-00006.bin +3 -0
pytorch_model-00003-of-00006.bin +3 -0
pytorch_model-00004-of-00006.bin +3 -0
pytorch_model-00005-of-00006.bin +3 -0
pytorch_model-00006-of-00006.bin +3 -0
pytorch_model.bin.index.json +566 -0
special_tokens_map.json +5 -0
spiece.model +3 -0
tokenizer.json +3 -0
tokenizer_config.json +11 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,33 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,960 @@

+---
+datasets:
+- bigscience/xP3
+- mc4
+license: apache-2.0
+language:
+- af
+- am
+- ar
+- az
+- be
+- bg
+- bn
+- ca
+- ceb
+- co
+- cs
+- cy
+- da
+- de
+- el
+- en
+- eo
+- es
+- et
+- eu
+- fa
+- fi
+- fil
+- fr
+- fy
+- ga
+- gd
+- gl
+- gu
+- ha
+- haw
+- hi
+- hmn
+- ht
+- hu
+- hy
+- ig
+- is
+- it
+- iw
+- ja
+- jv
+- ka
+- kk
+- km
+- kn
+- ko
+- ku
+- ky
+- la
+- lb
+- lo
+- lt
+- lv
+- mg
+- mi
+- mk
+- ml
+- mn
+- mr
+- ms
+- mt
+- my
+- ne
+- nl
+- 'no'
+- ny
+- pa
+- pl
+- ps
+- pt
+- ro
+- ru
+- sd
+- si
+- sk
+- sl
+- sm
+- sn
+- so
+- sq
+- sr
+- st
+- su
+- sv
+- sw
+- ta
+- te
+- tg
+- th
+- tr
+- uk
+- und
+- ur
+- uz
+- vi
+- xh
+- yi
+- yo
+- zh
+- zu
+tags:
+- text2text-generation
+widget:
+- text: >-
+    <table> <tr> <th>Name</th> <th>Explanation</th> <th>Example models</th>
+    </tr> <tr> <td><a
+    href=https://huggingface.co/datasets/bigscience/xP3>xP3</a></td>  <td>Mixture
+    of 13 training tasks in 46 languages with English prompts</td> <td><a
+    href=https://huggingface.co/bigscience/bloomz>bloomz</a> & <a
+    href=https://huggingface.co/bigscience/mt0-xxl>mt0-xxl</a></td> </tr> <tr>
+    <td><a href=https://huggingface.co/datasets/bigscience/xP3mt>xP3mt</a></td>
+    <td>Mixture of 13 training tasks in 46 languages with prompts in 20
+    languages (machine-translated from English)</td> <td><a
+    href=https://huggingface.co/bigscience/bloomz-mt>bloomz-mt</a> & <a
+    href=https://huggingface.co/bigscience/mt0-xxl-mt>mt0-xxl-mt</a></td> </tr>
+    <tr> <td><a
+    href=https://huggingface.co/datasets/bigscience/xP3all>xP3all</a></td>
+    <td>xP3 + our evaluation datasets adding an additional 3 tasks for a total
+    of 16 tasks in 46 languages with English prompts</td> <td></td> </tr> <tr>
+    <td><a
+    href=https://huggingface.co/datasets/bigscience/xP3megds>xP3megds</a></td>
+    <td><a
+    href=https://github.com/bigscience-workshop/Megatron-DeepSpeed>Megatron-DeepSpeed</a>
+    processed version of xP3</td> <td><a
+    href=https://huggingface.co/bigscience/bloomz>bloomz</a></td> </tr> <tr>
+    <td><a href=https://huggingface.co/datasets/Muennighoff/P3>P3</a></td>
+    <td>Repreprocessed version of the English-only <a
+    href=https://huggingface.co/datasets/bigscience/P3>P3</a> with 8 training
+    tasks</td> <td><a
+    href=https://huggingface.co/bigscience/bloomz-p3>bloomz-p3</a> & <a
+    href=https://huggingface.co/bigscience/mt0-xxl-p3>mt0-xxl-p3</a></td> </tr>
+    </table> Which dataset has the most tasks?
+  example_title: en-en struct-to-text
+- text: Life is beautiful! Translate to Mongolian.
+  example_title: mn-en translation
+- text: Le mot japonais «憂鬱» veut dire quoi en Odia?
+  example_title: jp-or-fr translation
+- text: >-
+    Stell mir eine schwierige Quiz Frage bei der es um Astronomie geht. Bitte
+    stell die Frage auf Norwegisch.
+  example_title: de-nb quiz
+- text: >-
+    We present BLOOMZ & mT0, a family of models capable of following human
+    instructions in dozens of languages zero-shot. We finetune BLOOM & mT5
+    pretrained multilingual language models on our crosslingual task mixture
+    (xP3) and find our resulting models capable of crosslingual generalization
+    to unseen tasks & languages. What are the keywords in Chinese?
+  example_title: zh-en keywords
+- text: >-
+    一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。Would you rate the previous
+    review as positive, neutral or negative?
+  example_title: zh-en sentiment
+- text: 一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评？
+  example_title: zh-zh sentiment
+- text: Suggest at least five related search terms to "Mạng neural nhân tạo".
+  example_title: vi-en query
+- text: >-
+    Proposez au moins cinq mots clés concernant «Réseau de neurones
+    artificiels».
+  example_title: fr-fr query
+- text: Explain in a sentence in Telugu what is backpropagation in neural networks.
+  example_title: te-en qa
+- text: Why is the sky blue?
+  example_title: en-en qa
+- text: >-
+    Write a fairy tale about a troll saving a princess from a dangerous dragon.
+    The fairy tale is a masterpiece that has achieved praise worldwide and its
+    moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish):
+  example_title: es-en fable
+- text: >-
+    Write a fable about wood elves living in a forest that is suddenly invaded
+    by ogres. The fable is a masterpiece that has achieved praise worldwide and
+    its moral is "Violence is the last refuge of the incompetent". Fable (in
+    Hindi):
+  example_title: hi-en fable
+model-index:
+- name: mt0-xxl
+  results:
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: winogrande
+      name: Winogrande XL (xl)
+      config: xl
+      split: validation
+      revision: a80f460359d1e9a67c006011c94de42a8759430c
+    metrics:
+    - type: Accuracy
+      value: 63.38
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (en)
+      config: en
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 81.29
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (fr)
+      config: fr
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 78.31
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (jp)
+      config: jp
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 78.62
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (pt)
+      config: pt
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 77.95
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (ru)
+      config: ru
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 76.51
+  - task:
+      type: Coreference resolution
+    dataset:
+      type: Muennighoff/xwinograd
+      name: XWinograd (zh)
+      config: zh
+      split: test
+      revision: 9dd5ea5505fad86b7bedad667955577815300cee
+    metrics:
+    - type: Accuracy
+      value: 77.38
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r1)
+      config: r1
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 49.5
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r2)
+      config: r2
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 43
+  - task:
+      type: Natural language inference
+    dataset:
+      type: anli
+      name: ANLI (r3)
+      config: r3
+      split: validation
+      revision: 9dbd830a06fea8b1c49d6e5ef2004a08d9f45094
+    metrics:
+    - type: Accuracy
+      value: 46.08
+  - task:
+      type: Natural language inference
+    dataset:
+      type: super_glue
+      name: SuperGLUE (cb)
+      config: cb
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 85.71
+  - task:
+      type: Natural language inference
+    dataset:
+      type: super_glue
+      name: SuperGLUE (rte)
+      config: rte
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 85.56
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ar)
+      config: ar
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 57.91
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (bg)
+      config: bg
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 59.88
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (de)
+      config: de
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 60.64
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (el)
+      config: el
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 59
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (en)
+      config: en
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 62.01
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (es)
+      config: es
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 60.8
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (fr)
+      config: fr
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 59.88
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (hi)
+      config: hi
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 57.23
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ru)
+      config: ru
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 58.88
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (sw)
+      config: sw
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 55.66
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (th)
+      config: th
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 57.43
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (tr)
+      config: tr
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 57.59
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (ur)
+      config: ur
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 55.42
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (vi)
+      config: vi
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 58.51
+  - task:
+      type: Natural language inference
+    dataset:
+      type: xnli
+      name: XNLI (zh)
+      config: zh
+      split: validation
+      revision: a5a45e4ff92d5d3f34de70aaf4b72c3bdf9f7f16
+    metrics:
+    - type: Accuracy
+      value: 59.12
+  - task:
+      type: Sentence completion
+    dataset:
+      type: story_cloze
+      name: StoryCloze (2016)
+      config: '2016'
+      split: validation
+      revision: e724c6f8cdf7c7a2fb229d862226e15b023ee4db
+    metrics:
+    - type: Accuracy
+      value: 96.04
+  - task:
+      type: Sentence completion
+    dataset:
+      type: super_glue
+      name: SuperGLUE (copa)
+      config: copa
+      split: validation
+      revision: 9e12063561e7e6c79099feb6d5a493142584e9e2
+    metrics:
+    - type: Accuracy
+      value: 93
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (et)
+      config: et
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 79
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (ht)
+      config: ht
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 81
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (id)
+      config: id
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 92
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (it)
+      config: it
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 90
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (qu)
+      config: qu
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 59
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (sw)
+      config: sw
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 79
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (ta)
+      config: ta
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 84
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (th)
+      config: th
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 77
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (tr)
+      config: tr
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 79
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (vi)
+      config: vi
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 88
+  - task:
+      type: Sentence completion
+    dataset:
+      type: xcopa
+      name: XCOPA (zh)
+      config: zh
+      split: validation
+      revision: 37f73c60fb123111fa5af5f9b705d0b3747fd187
+    metrics:
+    - type: Accuracy
+      value: 89
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (ar)
+      config: ar
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 91.07
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (es)
+      config: es
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 92.52
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (eu)
+      config: eu
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 90.6
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (hi)
+      config: hi
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 92.32
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (id)
+      config: id
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 93.51
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (my)
+      config: my
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 87.49
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (ru)
+      config: ru
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 91.4
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (sw)
+      config: sw
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 89.41
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (te)
+      config: te
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 90.54
+  - task:
+      type: Sentence completion
+    dataset:
+      type: Muennighoff/xstory_cloze
+      name: XStoryCloze (zh)
+      config: zh
+      split: validation
+      revision: 8bb76e594b68147f1a430e86829d07189622b90d
+    metrics:
+    - type: Accuracy
+      value: 93.85
+pipeline_tag: text2text-generation
+---
+![xmtf](https://github.com/bigscience-workshop/xmtf/blob/master/xmtf_banner.png?raw=true)
+#  Table of Contents
+1. [Model Summary](#model-summary)
+2. [Use](#use)
+3. [Limitations](#limitations)
+4. [Training](#training)
+5. [Evaluation](#evaluation)
+6. [Citation](#citation)
+# Model Summary
+> We present BLOOMZ & mT0, a family of models capable of following human instructions in dozens of languages zero-shot. We finetune BLOOM & mT5 pretrained multilingual language models on our crosslingual task mixture (xP3) and find our resulting models capable of crosslingual generalization to unseen tasks & languages.
+- **Repository:** [bigscience-workshop/xmtf](https://github.com/bigscience-workshop/xmtf)
+- **Paper:** [Crosslingual Generalization through Multitask Finetuning](https://arxiv.org/abs/2211.01786)
+- **Point of Contact:** [Niklas Muennighoff](mailto:niklas@hf.co)
+- **Languages:** Refer to [mc4](https://huggingface.co/datasets/mc4) for pretraining & [xP3](https://huggingface.co/datasets/bigscience/xP3) for finetuning language proportions. It understands both pretraining & finetuning languages.
+- **BLOOMZ & mT0 Model Family:**
+<div class="max-w-full overflow-auto">
+<table>
+  <tr>
+<th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3>xP3</a>. Recommended for prompting in English.</th>
+</tr>
+<tr>
+<td>Parameters</td>
+<td>300M</td>
+<td>580M</td>
+<td>1.2B</td>
+<td>3.7B</td>
+<td>13B</td>
+<td>560M</td>
+<td>1.1B</td>
+<td>1.7B</td>
+<td>3B</td>
+<td>7.1B</td>
+<td>176B</td>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td><a href=https://huggingface.co/bigscience/mt0-small>mt0-small</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-base>mt0-base</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-large>mt0-large</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xl>mt0-xl</a></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl>mt0-xxl</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-560m>bloomz-560m</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-1b1>bloomz-1b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-1b7>bloomz-1b7</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-3b>bloomz-3b</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1>bloomz-7b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz>bloomz</a></td>
+</tr>
+  <tr>
+<th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/bigscience/xP3mt>xP3mt</a>. Recommended for prompting in non-English.</th>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl-mt>mt0-xxl-mt</a></td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1-mt>bloomz-7b1-mt</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-mt>bloomz-mt</a></td>
+</tr>
+<tr>
+<th colspan="12">Multitask finetuned on <a style="font-weight:bold" href=https://huggingface.co/datasets/Muennighoff/P3>P3</a>. Released for research purposes only. Strictly inferior to above models!</th>
+</tr>
+<tr>
+<td>Finetuned Model</td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/mt0-xxl-p3>mt0-xxl-p3</a></td>
+<td></td>
+<td></td>
+<td></td>
+<td></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-7b1-p3>bloomz-7b1-p3</a></td>
+<td><a href=https://huggingface.co/bigscience/bloomz-p3>bloomz-p3</a></td>
+</tr>
+<tr>
+<th colspan="12">Original pretrained checkpoints. Not recommended.</th>
+</tr>
+<tr>
+<td>Pretrained Model</td>
+<td><a href=https://huggingface.co/google/mt5-small>mt5-small</a></td>
+<td><a href=https://huggingface.co/google/mt5-base>mt5-base</a></td>
+<td><a href=https://huggingface.co/google/mt5-large>mt5-large</a></td>
+<td><a href=https://huggingface.co/google/mt5-xl>mt5-xl</a></td>
+<td><a href=https://huggingface.co/google/mt5-xxl>mt5-xxl</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-560m>bloom-560m</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-1b1>bloom-1b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-1b7>bloom-1b7</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-3b>bloom-3b</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom-7b1>bloom-7b1</a></td>
+<td><a href=https://huggingface.co/bigscience/bloom>bloom</a></td>
+</tr>
+</table>
+</div>
+# Use
+## Intended use
+We recommend using the model to perform tasks expressed in natural language. For example, given the prompt "*Translate to English: Je t’aime.*", the model will most likely answer "*I love you.*". Some prompt ideas from our paper:
+- 一个传奇的开端，一个不灭的神话，这不仅仅是一部电影，而是作为一个走进新时代的标签，永远彪炳史册。你认为这句话的立场是赞扬、中立还是批评?
+- Suggest at least five related search terms to "Mạng neural nhân tạo".
+- Write a fairy tale about a troll saving a princess from a dangerous dragon. The fairy tale is a masterpiece that has achieved praise worldwide and its moral is "Heroes Come in All Shapes and Sizes". Story (in Spanish):
+- Explain in a sentence in Telugu what is backpropagation in neural networks.
+**Feel free to share your generations in the Community tab!**
+## How to use
+### CPU
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install -q transformers
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+checkpoint = "bigscience/mt0-xxl"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### GPU
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install -q transformers accelerate
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+checkpoint = "bigscience/mt0-xxl"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+### GPU in 8bit
+<details>
+<summary> Click to expand </summary>
+```python
+# pip install -q transformers accelerate bitsandbytes
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+checkpoint = "bigscience/mt0-xxl"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto", load_in_8bit=True)
+inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+</details>
+<!-- Necessary for whitespace -->
+###
+# Limitations
+**Prompt Engineering:** The performance may vary depending on the prompt. For BLOOMZ models, we recommend making it very clear when the input stops to avoid the model trying to continue it. For example, the prompt "*Translate to English: Je t'aime*" without the full stop (.) at the end, may result in the model trying to continue the French sentence. Better prompts are e.g. "*Translate to English: Je t'aime.*", "*Translate to English: Je t'aime. Translation:*", "*What is "Je t'aime." in English?*", where it is clear for the model when it should answer. Further, we recommend providing the model as much context as possible. For example, if you want it to answer in Telugu, then tell the model, e.g. "*Explain in a sentence in Telugu what is backpropagation in neural networks.*".
+# Training
+## Model
+- **Architecture:** Same as [mt5-xxl](https://huggingface.co/google/mt5-xxl), also refer to the `config.json` file
+- **Finetuning steps:** 7000
+- **Finetuning tokens:** 1.29 billion
+- **Precision:** bfloat16
+## Hardware
+- **TPUs:** TPUv4-256
+## Software
+- **Orchestration:** [T5X](https://github.com/google-research/t5x)
+- **Neural networks:** [Jax](https://github.com/google/jax)
+# Evaluation
+We refer to Table 7 from our [paper](https://arxiv.org/abs/2211.01786) & [bigscience/evaluation-results](https://huggingface.co/datasets/bigscience/evaluation-results) for zero-shot results on unseen tasks. The sidebar reports zero-shot performance of the best prompt per dataset config.
+# Citation
+```bibtex
+@article{muennighoff2022crosslingual,
+  title={Crosslingual generalization through multitask finetuning},
+  author={Muennighoff, Niklas and Wang, Thomas and Sutawika, Lintang and Roberts, Adam and Biderman, Stella and Scao, Teven Le and Bari, M Saiful and Shen, Sheng and Yong, Zheng-Xin and Schoelkopf, Hailey and others},
+  journal={arXiv preprint arXiv:2211.01786},
+  year={2022}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_name_or_path": "google/mt5-xxl",
+  "architectures": [
+    "MT5ForConditionalGeneration"
+  ],
+  "d_ff": 10240,
+  "d_kv": 64,
+  "d_model": 4096,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "mt5",
+  "num_decoder_layers": 24,
+  "num_heads": 64,
+  "num_layers": 24,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "tokenizer_class": "T5Tokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.23.1",
+  "use_cache": true,
+  "vocab_size": 250112
+}

model-00001-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d248e71582e351642d5e44e381e9be233ee2b04eb5833a7603d69ed51ac39585
+size 9936568872

model-00002-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eda538fe84abd7cab450a68c69a76588dc7a6c34ddc2418b6ab0e514bcdc0952
+size 9865443384

model-00003-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:881ec52a2544492e0a96949cef398ba8c9b2aa37a33618a0c37f418fa259a94e
+size 9869476808

model-00004-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fee75c951a13df924f0d496087fb477c4c72ac1c6a26f4b860ceaa825014cb87
+size 9999712688

model-00005-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:644f87dc1851d7c073cb46747fb0fb72738174e93ed0e07fd7c8e6aeca002f3d
+size 9999712416

model-00006-of-00006.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:287e19e2e0b91211df7cf90d8760e60572ca04b1ada82110e582a4581baa9255
+size 6111219176

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,566 @@

+{
+    "metadata": {
+        "total_size": 55782064128
+    },
+    "weight_map": {
+        "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.2.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.0.layer.2.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.2.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.1.layer.2.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.10.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.11.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.12.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.12.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.12.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.13.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.14.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.15.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.16.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.17.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.18.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.19.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.k.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.o.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.q.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.1.EncDecAttention.v.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "decoder.block.2.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.2.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.20.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.0.SelfAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.0.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.k.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.o.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.1.EncDecAttention.v.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.1.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.2.DenseReluDense.wo.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.21.layer.2.layer_norm.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.k.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.o.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.q.weight": "model-00005-of-00006.safetensors",
+        "decoder.block.22.layer.0.SelfAttention.v.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.0.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.k.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.o.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.q.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.1.EncDecAttention.v.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.1.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.2.DenseReluDense.wo.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.22.layer.2.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.k.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.o.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.q.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.0.SelfAttention.v.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.0.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.k.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.o.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.q.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.1.EncDecAttention.v.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.1.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.2.DenseReluDense.wo.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.23.layer.2.layer_norm.weight": "model-00006-of-00006.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.3.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.4.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.5.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.6.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.7.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.8.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.0.SelfAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.0.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.k.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.o.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.q.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.1.EncDecAttention.v.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.1.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.2.DenseReluDense.wo.weight": "model-00004-of-00006.safetensors",
+        "decoder.block.9.layer.2.layer_norm.weight": "model-00004-of-00006.safetensors",
+        "decoder.embed_tokens.weight": "model-00003-of-00006.safetensors",
+        "decoder.final_layer_norm.weight": "model-00006-of-00006.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.0.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.1.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.10.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.11.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.12.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.13.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.14.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.15.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.16.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.17.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.2.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.20.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.20.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.20.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.20.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.21.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.22.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.k.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.o.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.q.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.0.SelfAttention.v.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.0.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.1.DenseReluDense.wo.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.23.layer.1.layer_norm.weight": "model-00003-of-00006.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.3.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.4.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.5.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.1.DenseReluDense.wo.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.6.layer.1.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.k.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.o.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.q.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.0.SelfAttention.v.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.0.layer_norm.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "model-00001-of-00006.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.7.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.7.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.8.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.k.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.o.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.q.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.0.SelfAttention.v.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.0.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00002-of-00006.safetensors",
+        "encoder.block.9.layer.1.layer_norm.weight": "model-00002-of-00006.safetensors",
+        "encoder.final_layer_norm.weight": "model-00003-of-00006.safetensors",
+        "lm_head.weight": "model-00006-of-00006.safetensors",
+        "shared.weight": "model-00001-of-00006.safetensors"
+    }
+}

pytorch_model-00001-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:295a276775d79359cfd243bd93c9e2c408a8e33718e5bee1d05625f026af6175
+size 9936583612

pytorch_model-00002-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c21533a6182886bec48cd0190952b3c5e71224873234135c2754f7c81d02ac82
+size 9865466989

pytorch_model-00003-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62cc874eb7f5cfa6fcbde4a19bab7de1f7bf8b47f0f01c45713927115c85a153
+size 9869491791

pytorch_model-00004-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b2a5945f7c037b99eaf5ed891fc158b23a791b92042861a5298b0c8ec224be
+size 9999740653

pytorch_model-00005-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f769732a1c4ba3a9cbd9ea1c2701ade3cdf2a35f73e75ac77d0c26788a5d88f
+size 9999739675

pytorch_model-00006-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92679f99746d0e1082d7407091cb7f2a588d49b9bf13724f706e8912f86c5786
+size 6111224758

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,566 @@

+{
+  "metadata": {
+    "total_size": 55782064128
+  },
+  "weight_map": {
+    "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.12.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.1.EncDecAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.1.EncDecAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.1.EncDecAttention.k.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.1.EncDecAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00005-of-00006.bin",
+    "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.1.EncDecAttention.k.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.1.EncDecAttention.q.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.1.EncDecAttention.k.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.1.EncDecAttention.q.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.1.EncDecAttention.k.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.1.EncDecAttention.q.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00004-of-00006.bin",
+    "decoder.embed_tokens.weight": "pytorch_model-00003-of-00006.bin",
+    "decoder.final_layer_norm.weight": "pytorch_model-00006-of-00006.bin",
+    "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00006.bin",
+    "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00002-of-00006.bin",
+    "encoder.final_layer_norm.weight": "pytorch_model-00003-of-00006.bin",
+    "lm_head.weight": "pytorch_model-00006-of-00006.bin",
+    "shared.weight": "pytorch_model-00001-of-00006.bin"
+  }
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+size 4309802

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93c3578052e1605d8332eb961bc08d72e246071974e4cc54aa6991826b802aa5
+size 16330369

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "additional_special_tokens": null,
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "name_or_path": "google/mt5-xxl",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}