ojo committed on
Commit
8c0fb70
·
verified ·
1 Parent(s): 2c4294c

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,3 @@
+ Copyright (C) 2024 Xiaomi Corporation.
+
+ Licensed under the [Gemma](https://ai.google.dev/gemma/terms).
README.md CHANGED
@@ -1,3 +1,71 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: gemma
+ license_name: license
+ license_link: LICENSE
+ base_model:
+ - ModelSpace/GemmaX2-28-2B-v0.1
+ pipeline_tag: translation
+ library_name: transformers
+ tags:
+ - text-generation
+ language:
+ - de
+ - en
+ - fr
+ - es
+ ---
+ ## Model Summary
+
+ OLaPh is a large language model for phonemization, finetuned from GemmaX2-28-2B-v0.1.
+ Its tokenizer was extended with 1,024 phoneme tokens, derived from a BPE tokenizer trained on phoneme sequences generated by the OLaPh framework (to be released).
+
+ The model was then finetuned for grapheme-to-phoneme conversion on a multilingual dataset (English, German, French, Spanish), created by phonemizing text from HuggingFaceFW/fineweb and HuggingFaceFW/fineweb-2 using the OLaPh framework.
+
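The BPE derivation step above can be sketched in a few lines. This is an illustrative toy, not the OLaPh pipeline: the function name and the four-string corpus are made up, and a real run would use the `tokenizers` library over large phonemized corpora.

```python
# Illustrative sketch only (not the OLaPh code): greedily learning BPE merges
# over phoneme sequences, the kind of procedure that yields reusable phoneme
# tokens such as "aɪ̯n" before they are added to the base tokenizer.
from collections import Counter

def learn_bpe_merges(sequences, num_merges):
    """Repeatedly merge the most frequent adjacent symbol pair."""
    corpus = [list(seq) for seq in sequences]  # start from single symbols
    merges = []
    for _ in range(num_merges):
        pairs = Counter()
        for seq in corpus:
            for pair in zip(seq, seq[1:]):
                pairs[pair] += 1
        if not pairs:
            break
        a, b = max(pairs, key=pairs.get)  # on ties, first pair seen wins
        merges.append(a + b)
        merged_corpus = []
        for seq in corpus:
            out, i = [], 0
            while i < len(seq):
                if i + 1 < len(seq) and (seq[i], seq[i + 1]) == (a, b):
                    out.append(a + b)  # apply the new merge everywhere
                    i += 2
                else:
                    out.append(seq[i])
                    i += 1
            merged_corpus.append(out)
        corpus = merged_corpus
    return merges

# Tiny toy corpus of German IPA strings (placeholders, not training data).
print(learn_bpe_merges(["aɪ̯n", "aɪ̯nə", "ʊnt", "aɪ̯nəm"], 3))
# ['aɪ', 'aɪ̯', 'aɪ̯n']
```

The resulting tokens would then be registered with `tokenizer.add_tokens(...)` and the model grown with `model.resize_token_embeddings(len(tokenizer))`.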
+ - **Finetuned by**: Institute for Information Systems of Hof University
+ - **Model type**: Text-to-Text
+ - **Language(s)**: English, French, German, Spanish
+ - **License**: Gemma (Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms)
+ - **Release Date**: September 25, 2025
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ lang = "English"  # or "German", "French", "Spanish"
+ sentence = "But we are not sorry, for the rain is delightful."
+
+ model_id = "iisys-hof/olaph"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda")
+
+ # Stop on either the end-of-sequence token or the first "." token.
+ stop_tokens = [tokenizer.eos_token_id, tokenizer.encode(".", add_special_tokens=False)[0]]
+
+ prompt = f"Translate this from {lang} to Phones:\n{lang}: "
+ inputs = tokenizer(f"{prompt}{sentence}\nPhones:", return_tensors="pt").to("cuda")
+
+ outputs = model.generate(**inputs, max_new_tokens=256, eos_token_id=stop_tokens)
+ phonemized = tokenizer.decode(outputs[0], skip_special_tokens=False)
+ phonemized = phonemized.split("\n")[-1].replace("Phones:", "").strip()
+
+ print(phonemized)
+ ```
+
+ ## Caveats
+
+ ## Citation
+ ```bibtex
+ @misc{wirth2025olaphoptimallanguagephonemizer,
+   title={OLaPh: Optimal Language Phonemizer},
+   author={Johannes Wirth},
+   year={2025},
+   eprint={2509.20086},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL},
+   url={https://arxiv.org/abs/2509.20086},
+ }
+ ```
added_tokens.json ADDED
@@ -0,0 +1,608 @@
1
+ {
2
+ " avɛ": 256208,
3
+ " avɛk": 256343,
4
+ " aʁ": 256368,
5
+ " aʊ̯": 256073,
6
+ " aʊ̯f": 256165,
7
+ " aʊ̯s": 256266,
8
+ " aʊ̯x": 256211,
9
+ " baɪ̯": 256333,
10
+ " bjɛ̃": 256478,
11
+ " bə": 256075,
12
+ " bɪk": 256505,
13
+ " bɪs": 256533,
14
+ " bɹ": 256256,
15
+ " dɑ̃": 256175,
16
+ " dɔx": 256523,
17
+ " dɔ̃": 256509,
18
+ " dɛn": 256601,
19
+ " dɛs": 256199,
20
+ " dɪs": 256361,
21
+ " dɹ": 256404,
22
+ " dʊʁç": 256413,
23
+ " d‍ʒ": 256188,
24
+ " etɛ": 256277,
25
+ " fwe": 256511,
26
+ " fɔn": 256146,
27
+ " fɔʁ": 256507,
28
+ " fɛ": 256513,
29
+ " fɛɐ̯": 256091,
30
+ " fɛʁ": 256545,
31
+ " fɹ": 256144,
32
+ " fʁ": 256388,
33
+ " gʁ": 256389,
34
+ " kaʁ": 256458,
35
+ " komo": 256320,
36
+ " kɔ": 256599,
37
+ " kɔm": 256184,
38
+ " kɔ̃": 256135,
39
+ " kə": 256085,
40
+ " kəm": 256311,
41
+ " kən": 256461,
42
+ " kəns": 256600,
43
+ " kɛl": 256517,
44
+ " kɹ": 256315,
45
+ " lœʁ": 256459,
46
+ " lə": 256062,
47
+ " lɥi": 256345,
48
+ " maʁ": 256393,
49
+ " mwa": 256563,
50
+ " mɔ̃": 256330,
51
+ " mɛ": 256227,
52
+ " mɛm": 256550,
53
+ " mɪt": 256147,
54
+ " mɪç": 256407,
55
+ " nɔx": 256324,
56
+ " nɔ̃": 256498,
57
+ " nə": 256224,
58
+ " nɪçt": 256140,
59
+ " otʁ": 256602,
60
+ " paɾ": 256192,
61
+ " paɾa": 256291,
62
+ " paʁ": 256111,
63
+ " peɾ": 256501,
64
+ " peɾo": 256548,
65
+ " poɾ": 256177,
66
+ " puʁ": 256196,
67
+ " pø": 256521,
68
+ " pɔ": 256568,
69
+ " pə": 256514,
70
+ " pɛʁ": 256328,
71
+ " pɹ": 256142,
72
+ " pɹə": 256394,
73
+ " pɾi": 256540,
74
+ " pɾo": 256321,
75
+ " pʁ": 256103,
76
+ " pʁe": 256444,
77
+ " pʁɔ": 256284,
78
+ " seɲ": 256604,
79
+ " stɹ": 256334,
80
+ " syʁ": 256235,
81
+ " sɑ̃": 256289,
82
+ " sɔ̃": 256172,
83
+ " sɛ": 256373,
84
+ " sɛt": 256258,
85
+ " sɛʁ": 256580,
86
+ " sɛ̃": 256453,
87
+ " s̪": 256183,
88
+ " tje": 256520,
89
+ " tɑ̃": 256574,
90
+ " tɹ": 256202,
91
+ " tɾa": 256552,
92
+ " tʁ": 256236,
93
+ " tʁa": 256484,
94
+ " t͡s": 256049,
95
+ " t͡su": 256074,
96
+ " t͡sʊm": 256490,
97
+ " t‍ʃ": 256226,
98
+ " vwa": 256588,
99
+ " vɛn": 256375,
100
+ " vɪ": 256405,
101
+ " vɪɐ̯t": 256406,
102
+ " zaɪ̯n": 256367,
103
+ " zɪnt": 256359,
104
+ " zɪç": 256143,
105
+ " ðə": 256007,
106
+ " œ̃": 256121,
107
+ " ɐ": 256057,
108
+ " ɐb": 256383,
109
+ " ɐd": 256587,
110
+ " ɐl": 256559,
111
+ " ɐɡ": 256472,
112
+ " ɑ̃": 256056,
113
+ " ɑ̃tʁ": 256581,
114
+ " ɔ": 256210,
115
+ " ɔʁ": 256370,
116
+ " ɔ̃": 256254,
117
+ " ənt": 256482,
118
+ " əp": 256538,
119
+ " ɛks": 256259,
120
+ " ɛl": 256193,
121
+ " ɛnt": 256279,
122
+ " ɛs": 256116,
123
+ " ɛst": 256101,
124
+ " ɛtʁ": 256554,
125
+ " ɛɐ̯": 256145,
126
+ " ɛ̃": 256162,
127
+ " ɡ": 256016,
128
+ " ɡl": 256582,
129
+ " ɡə": 256052,
130
+ " ɡɹ": 256214,
131
+ " ɪm": 256158,
132
+ " ɪn": 256053,
133
+ " ɪst": 256138,
134
+ " ɪç": 256120,
135
+ " ɹ": 256039,
136
+ " ɹɪs": 256547,
137
+ " ʁ": 256042,
138
+ " ʁa": 256419,
139
+ " ʁe": 256185,
140
+ " ʁi": 256596,
141
+ " ʁə": 256218,
142
+ " ʃ": 256031,
143
+ " ʃo": 256440,
144
+ " ʃta": 256510,
145
+ " ʊm": 256310,
146
+ " ʊn": 256414,
147
+ " ʊns": 256593,
148
+ " ʊnt": 256035,
149
+ " ʌn": 256519,
150
+ " ʒ": 256079,
151
+ " ʒə": 256215,
152
+ " ʝ": 256391,
153
+ " ʼ": 256428,
154
+ " ˈ": 256000,
155
+ " ˌ": 256064,
156
+ "alə": 256603,
157
+ "ant͡s": 256409,
158
+ "asjɔ̃": 256247,
159
+ "að": 256067,
160
+ "aða": 256305,
161
+ "aðo": 256213,
162
+ "aðos": 256549,
163
+ "aɪ̯": 256004,
164
+ "aɪ̯l": 256357,
165
+ "aɪ̯n": 256018,
166
+ "aɪ̯nɐ": 256249,
167
+ "aɪ̯nə": 256114,
168
+ "aɪ̯nəm": 256293,
169
+ "aɪ̯nən": 256230,
170
+ "aɪ̯nəs": 256558,
171
+ "aɪ̯t": 256137,
172
+ "aɾ": 256043,
173
+ "aʁ": 256026,
174
+ "aʊ̯": 256083,
175
+ "aʊ̯f": 256312,
176
+ "aʊ̯s": 256245,
177
+ "a‍": 256047,
178
+ "a‍ɪ": 256009,
179
+ "a‍ɪd": 256233,
180
+ "a‍ɪf": 256522,
181
+ "a‍ɪk": 256402,
182
+ "a‍ɪl": 256355,
183
+ "a‍ɪm": 256319,
184
+ "a‍ɪn": 256322,
185
+ "a‍ɪnd": 256358,
186
+ "a‍ɪt": 256168,
187
+ "a‍ɪv": 256515,
188
+ "a‍ɪz": 256381,
189
+ "a‍ɪ‍ə": 256282,
190
+ "a‍ʊ": 256051,
191
+ "a‍ʊn": 256339,
192
+ "a‍ʊnd": 256372,
193
+ "a‍ʊt": 256186,
194
+ "a‍ʊ‍ə": 256349,
195
+ "baɪ̯": 256283,
196
+ "bm̩": 256278,
197
+ "bn̩": 256171,
198
+ "bɐ": 256094,
199
+ "bə": 256159,
200
+ "bə‍l": 256222,
201
+ "bɝ": 256429,
202
+ "bɹ": 256539,
203
+ "bʁ": 256248,
204
+ "dn̩": 256131,
205
+ "dɐ": 256136,
206
+ "dɔɪ̯": 256592,
207
+ "dən": 256351,
208
+ "dɛ": 256543,
209
+ "dɪŋ": 256433,
210
+ "dʁ": 256356,
211
+ "dʒ": 256187,
212
+ "d‍ʒ": 256197,
213
+ "eɪ": 256027,
214
+ "eɾ": 256033,
215
+ "eɾa": 256366,
216
+ "eɾo": 256273,
217
+ "e‍ə": 256077,
218
+ "e‍ɪ": 256011,
219
+ "e‍ɪd": 256264,
220
+ "e‍ɪk": 256250,
221
+ "e‍ɪl": 256557,
222
+ "e‍ɪm": 256220,
223
+ "e‍ɪn": 256296,
224
+ "e‍ɪnd": 256398,
225
+ "e‍ɪnd‍ʒ": 256566,
226
+ "e‍ɪs": 256287,
227
+ "e‍ɪt": 256128,
228
+ "e‍ɪtɪd": 256382,
229
+ "e‍ɪv": 256526,
230
+ "e‍ɪʃən": 256167,
231
+ "fn̩": 256269,
232
+ "ftɐ": 256481,
233
+ "fɔl": 256387,
234
+ "fɔʁ": 256465,
235
+ "fə": 256497,
236
+ "fɛɐ̯": 256524,
237
+ "fʁ": 256426,
238
+ "hatə": 256395,
239
+ "haɪ̯": 256508,
240
+ "haɪ̯t": 256493,
241
+ "hæv": 256191,
242
+ "ið": 256238,
243
+ "iðo": 256445,
244
+ "iɾ": 256489,
245
+ "i‍ə": 256126,
246
+ "jeɾ": 256295,
247
+ "jœʁ": 256589,
248
+ "jɔ̃": 256098,
249
+ "jɛ": 256534,
250
+ "jɛʁ": 256288,
251
+ "jɛ̃": 256200,
252
+ "jʊ": 256309,
253
+ "kaɪ̯t": 256492,
254
+ "ktə": 256265,
255
+ "kœ": 256512,
256
+ "kɑ": 256438,
257
+ "kɔ": 256401,
258
+ "kə": 256263,
259
+ "kɹ": 256317,
260
+ "laŋ": 256487,
261
+ "laɪ̯": 256239,
262
+ "laɪ̯ç": 256571,
263
+ "laʊ̯": 256441,
264
+ "lɐ": 256415,
265
+ "lən": 256272,
266
+ "lɛ": 256344,
267
+ "lɪç": 256088,
268
+ "lɪçn̩": 256443,
269
+ "lɪçə": 256494,
270
+ "l̩": 256130,
271
+ "maɪ̯": 256584,
272
+ "mɑ̃": 256113,
273
+ "mən": 256152,
274
+ "mənt": 256243,
275
+ "mɛ": 256286,
276
+ "mɛn": 256466,
277
+ "mɪt": 256304,
278
+ "m̩": 256205,
279
+ "ndn̩": 256451,
280
+ "ndɐ": 256298,
281
+ "ndə": 256237,
282
+ "noʊ": 256274,
283
+ "ntə": 256377,
284
+ "nt͡s": 256369,
285
+ "nɐ": 256595,
286
+ "nə": 256209,
287
+ "nən": 256189,
288
+ "nəs": 256442,
289
+ "nɛ": 256446,
290
+ "nɪk": 256486,
291
+ "n̩": 256005,
292
+ "oɾ": 256048,
293
+ "oɾa": 256480,
294
+ "oʊ": 256078,
295
+ "pə‍l": 256555,
296
+ "pɹ": 256225,
297
+ "pʁ": 256569,
298
+ "pʁi": 256536,
299
+ "p͡f": 256416,
300
+ "sjɔ̃": 256448,
301
+ "sn̩": 256280,
302
+ "stn̩": 256260,
303
+ "stɐ": 256410,
304
+ "stə": 256240,
305
+ "tn̩": 256076,
306
+ "tɐ": 256100,
307
+ "tɑ̃": 256585,
308
+ "tə": 256024,
309
+ "tən": 256503,
310
+ "tət": 256462,
311
+ "tɛ": 256161,
312
+ "tɛʁ": 256570,
313
+ "tɪd": 256470,
314
+ "tʁ": 256072,
315
+ "tʁa": 256474,
316
+ "tʃ": 256149,
317
+ "tʃa": 256542,
318
+ "tʃo": 256464,
319
+ "t͡": 256430,
320
+ "t͡s": 256028,
321
+ "t͡si": 256198,
322
+ "t͡si̯o": 256340,
323
+ "t͡sn̩": 256537,
324
+ "t͡su": 256253,
325
+ "t‍ʃ": 256216,
326
+ "uʁ": 256099,
327
+ "vaɪ̯": 256180,
328
+ "vɑ̃": 256544,
329
+ "vɔl": 256467,
330
+ "vɔʁ": 256516,
331
+ "vən": 256551,
332
+ "vɛ": 256097,
333
+ "vɛl": 256432,
334
+ "vɛʁ": 256166,
335
+ "vɛʁdn̩": 256352,
336
+ "vɪs": 256535,
337
+ "vɪʁ": 256562,
338
+ "vʊʁ": 256275,
339
+ "vʊʁdə": 256376,
340
+ "waʁ": 256255,
341
+ "weɾ": 256431,
342
+ "wɛ̃": 256418,
343
+ "wɝ": 256203,
344
+ "xn̩": 256485,
345
+ "xtə": 256576,
346
+ "yʁ": 256125,
347
+ "zaɪ̯nə": 256457,
348
+ "zn̩": 256262,
349
+ "zɐ": 256353,
350
+ "zɔ": 256567,
351
+ "zɔl": 256437,
352
+ "zə": 256182,
353
+ "zɛ": 256400,
354
+ "ækt": 256338,
355
+ "æm": 256299,
356
+ "æp": 256403,
357
+ "æz": 256095,
358
+ "æŋ": 256528,
359
+ "æɫ": 256553,
360
+ "æɹ": 256412,
361
+ "çn̩": 256374,
362
+ "çt": 256323,
363
+ "çə": 256439,
364
+ "ðað": 256341,
365
+ "ðe": 256297,
366
+ "ðo": 256153,
367
+ "ðos": 256495,
368
+ "ðɐ": 256190,
369
+ "ŋg": 256469,
370
+ "ŋk": 256155,
371
+ "œl": 256556,
372
+ "œʁ": 256106,
373
+ "œ̃": 256112,
374
+ "ɐn": 256117,
375
+ "ɐ̯": 256002,
376
+ "ɐ̯t": 256169,
377
+ "ɐ̯tə": 256477,
378
+ "ɑɹ": 256115,
379
+ "ɑ̃": 256006,
380
+ "ɑ̃d": 256217,
381
+ "ɑ̃s": 256148,
382
+ "ɑ̃t": 256246,
383
+ "ɒd": 256506,
384
+ "ɒf": 256435,
385
+ "ɒl": 256160,
386
+ "ɒls": 256348,
387
+ "ɒlsə‍ʊ": 256362,
388
+ "ɒm": 256179,
389
+ "ɒn": 256063,
390
+ "ɒnt": 256578,
391
+ "ɒp": 256360,
392
+ "ɒt": 256092,
393
+ "ɒv": 256022,
394
+ "ɒz": 256080,
395
+ "ɒŋ": 256327,
396
+ "ɒɹ": 256499,
397
+ "ɔk": 256560,
398
+ "ɔl": 256084,
399
+ "ɔm": 256082,
400
+ "ɔmən": 256463,
401
+ "ɔn": 256065,
402
+ "ɔs": 256436,
403
+ "ɔt": 256527,
404
+ "ɔx": 256173,
405
+ "ɔɪ̯": 256086,
406
+ "ɔɹ": 256229,
407
+ "ɔʁ": 256038,
408
+ "ɔʁt": 256223,
409
+ "ɔ̃": 256014,
410
+ "ɔ‍ɪ": 256261,
411
+ "əd": 256252,
412
+ "ənd": 256232,
413
+ "əns": 256178,
414
+ "ənt": 256066,
415
+ "ənz": 256335,
416
+ "əs": 256044,
417
+ "əz": 256292,
418
+ "əɹ": 256194,
419
+ "əɹi": 256575,
420
+ "ə‍": 256008,
421
+ "ə‍l": 256037,
422
+ "ə‍li": 256530,
423
+ "ə‍ʊ": 256017,
424
+ "ə‍ʊk": 256475,
425
+ "ə‍ʊl": 256399,
426
+ "ə‍ʊld": 256326,
427
+ "ə‍ʊn": 256212,
428
+ "ə‍ʊnli": 256488,
429
+ "ə‍ʊp": 256586,
430
+ "ə‍ʊst": 256385,
431
+ "ə‍ʊt": 256541,
432
+ "ə‍ʊv": 256420,
433
+ "ə‍ʊvɐ": 256591,
434
+ "ə‍ʊz": 256285,
435
+ "ɛd": 256127,
436
+ "ɛf": 256491,
437
+ "ɛk": 256154,
438
+ "ɛks": 256422,
439
+ "ɛkt": 256195,
440
+ "ɛl": 256032,
441
+ "ɛlf": 256379,
442
+ "ɛlp": 256479,
443
+ "ɛlt": 256276,
444
+ "ɛm": 256122,
445
+ "ɛn": 256030,
446
+ "ɛnd": 256150,
447
+ "ɛni": 256331,
448
+ "ɛns": 256598,
449
+ "ɛnt": 256105,
450
+ "ɛp": 256449,
451
+ "ɛs": 256054,
452
+ "ɛst": 256124,
453
+ "ɛt": 256050,
454
+ "ɛtʁ": 256565,
455
+ "ɛt͡": 256308,
456
+ "ɛt͡st": 256455,
457
+ "ɛv": 256151,
458
+ "ɛvɐ": 256336,
459
+ "ɛz": 256281,
460
+ "ɛŋ": 256417,
461
+ "ɛɐ̯": 256071,
462
+ "ɛɹ": 256396,
463
+ "ɛɹi": 256363,
464
+ "ɛʁ": 256020,
465
+ "ɛʁn": 256386,
466
+ "ɛ̃": 256059,
467
+ "ɝz": 256206,
468
+ "ɡa": 256347,
469
+ "ɡe": 256201,
470
+ "ɡi": 256471,
471
+ "ɡn̩": 256081,
472
+ "ɡz": 256529,
473
+ "ɡɐ": 256427,
474
+ "ɡə": 256069,
475
+ "ɡɹ": 256371,
476
+ "ɡʁ": 256397,
477
+ "ɡʁo": 256605,
478
+ "ɣa": 256579,
479
+ "ɣo": 256329,
480
+ "ɥi": 256119,
481
+ "ɪd": 256041,
482
+ "ɪd‍ʒ": 256244,
483
+ "ɪf": 256102,
484
+ "ɪk": 256034,
485
+ "ɪks": 256561,
486
+ "ɪkt": 256450,
487
+ "ɪkə‍l": 256502,
488
+ "ɪl": 256058,
489
+ "ɪm": 256046,
490
+ "ɪmɐ": 256518,
491
+ "ɪn": 256010,
492
+ "ɪnd": 256176,
493
+ "ɪns": 256546,
494
+ "ɪnt": 256109,
495
+ "ɪntʊ": 256434,
496
+ "ɪp": 256207,
497
+ "ɪs": 256040,
498
+ "ɪst": 256068,
499
+ "ɪt": 256013,
500
+ "ɪti": 256234,
501
+ "ɪts": 256228,
502
+ "ɪtə‍l": 256531,
503
+ "ɪt‍ʃ": 256204,
504
+ "ɪv": 256118,
505
+ "ɪz": 256023,
506
+ "ɪç": 256015,
507
+ "ɪçt": 256096,
508
+ "ɪð": 256108,
509
+ "ɪŋ": 256019,
510
+ "ɪŋk": 256456,
511
+ "ɪŋz": 256583,
512
+ "ɪɡ": 256181,
513
+ "ɪɡn̩": 256316,
514
+ "ɪɡə": 256337,
515
+ "ɪɫ": 256421,
516
+ "ɪɹ": 256392,
517
+ "ɪʁ": 256270,
518
+ "ɪʃ": 256089,
519
+ "ɪʃn̩": 256257,
520
+ "ɪʃə": 256390,
521
+ "ɪʃən": 256301,
522
+ "ɪ̯": 256003,
523
+ "ɫa": 256476,
524
+ "ɫi": 256384,
525
+ "ɫz": 256354,
526
+ "ɹi": 256087,
527
+ "ɹə": 256110,
528
+ "ɹɪ": 256300,
529
+ "ɹɪŋ": 256468,
530
+ "ɾa": 256093,
531
+ "ɾan": 256424,
532
+ "ɾas": 256408,
533
+ "ɾe": 256104,
534
+ "ɾes": 256303,
535
+ "ɾi": 256141,
536
+ "ɾia": 256572,
537
+ "ɾo": 256134,
538
+ "ɾos": 256504,
539
+ "ʁa": 256036,
540
+ "ʁaɪ̯": 256163,
541
+ "ʁaʊ̯": 256219,
542
+ "ʁe": 256170,
543
+ "ʁi": 256061,
544
+ "ʁo": 256174,
545
+ "ʁu": 256473,
546
+ "ʁy": 256306,
547
+ "ʁɐ": 256452,
548
+ "ʁɔ": 256365,
549
+ "ʁə": 256157,
550
+ "ʁən": 256133,
551
+ "ʁɛ": 256090,
552
+ "ʁʊŋ": 256423,
553
+ "ʃa": 256290,
554
+ "ʃaft": 256594,
555
+ "ʃe": 256525,
556
+ "ʃi": 256346,
557
+ "ʃn̩": 256500,
558
+ "ʃp": 256267,
559
+ "ʃt": 256107,
560
+ "ʃta": 256271,
561
+ "ʃtɛl": 256577,
562
+ "ʃən": 256060,
563
+ "ʃənz": 256447,
564
+ "ʃə‍l": 256590,
565
+ "ʊd": 256129,
566
+ "ʊk": 256314,
567
+ "ʊl": 256532,
568
+ "ʊm": 256139,
569
+ "ʊn": 256156,
570
+ "ʊnt": 256025,
571
+ "ʊntɐ": 256307,
572
+ "ʊs": 256251,
573
+ "ʊt": 256597,
574
+ "ʊŋ": 256070,
575
+ "ʊŋən": 256325,
576
+ "ʊʁ": 256132,
577
+ "ʊʁç": 256294,
578
+ "ʊ̯": 256021,
579
+ "ʌm": 256164,
580
+ "ʌn": 256242,
581
+ "ʌnt": 256425,
582
+ "ʌp": 256302,
583
+ "ʌs": 256364,
584
+ "ʌst": 256268,
585
+ "ʌt": 256123,
586
+ "ʌt‍ʃ": 256342,
587
+ "ʌv": 256378,
588
+ "ʌðɐ": 256332,
589
+ "ʌŋ": 256496,
590
+ "ʎa": 256350,
591
+ "ʎe": 256483,
592
+ "ʏk": 256380,
593
+ "ʏn": 256573,
594
+ "ʏʁ": 256318,
595
+ "ʒe": 256460,
596
+ "ʒi": 256454,
597
+ "ː.": 256241,
598
+ "ːˈ": 256221,
599
+ "ːˌ": 256411,
600
+ "̯o": 256231,
601
+ "͡f": 256313,
602
+ "͡s": 256012,
603
+ "‍ə": 256029,
604
+ "‍ən": 256564,
605
+ "‍ɪ": 256001,
606
+ "‍ʃ": 256045,
607
+ "‍ʒ": 256055
608
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 1.0,
+   "total_flos": 2097228602327040.0,
+   "train_loss": 0.050915672732994244,
+   "train_runtime": 738268.7103,
+   "train_samples_per_second": 10.388,
+   "train_steps_per_second": 0.325
+ }
chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content }}{% endif %}{% endfor %}
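The template above simply emits an optional system message followed by the raw user/assistant contents, with no role markers. A small standalone render (assuming the `jinja2` package, outside of `transformers`) makes that behavior visible; the example messages are illustrative:

```python
# Render the chat_template.jinja logic directly with Jinja2 to show that it
# concatenates the system message and the message contents verbatim.
# (Note: in Jinja2, `set` inside an `if` block is visible afterwards,
# because `if` does not introduce a new scope.)
from jinja2 import Template

CHAT_TEMPLATE = (
    "{% if messages[0]['role'] == 'system' %}"
    "{% set loop_messages = messages[1:] %}"
    "{% set system_message = messages[0]['content'] %}"
    "{% else %}{% set loop_messages = messages %}{% endif %}"
    "{% if system_message is defined %}{{ system_message }}{% endif %}"
    "{% for message in loop_messages %}"
    "{% set content = message['content'] %}"
    "{% if message['role'] == 'user' %}{{ content }}"
    "{% elif message['role'] == 'assistant' %}{{ content }}{% endif %}"
    "{% endfor %}"
)

# Illustrative conversation in the model's prompt format.
messages = [
    {"role": "system", "content": "Translate this from English to Phones:\n"},
    {"role": "user", "content": "English: But we are not sorry.\nPhones:"},
]
print(Template(CHAT_TEMPLATE).render(messages=messages))
```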
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "Gemma2ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "attn_logit_softcapping": 50.0,
+   "bos_token_id": 2,
+   "cache_implementation": "hybrid",
+   "eos_token_id": 1,
+   "final_logit_softcapping": 30.0,
+   "head_dim": 256,
+   "hidden_act": "gelu_pytorch_tanh",
+   "hidden_activation": "gelu_pytorch_tanh",
+   "hidden_size": 2304,
+   "initializer_range": 0.02,
+   "intermediate_size": 9216,
+   "max_position_embeddings": 8192,
+   "model_type": "gemma2",
+   "num_attention_heads": 8,
+   "num_hidden_layers": 26,
+   "num_key_value_heads": 4,
+   "pad_token_id": 0,
+   "query_pre_attn_scalar": 256,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 10000.0,
+   "sliding_window": 4096,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.52.1",
+   "use_cache": false,
+   "vocab_size": 256606
+ }
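A quick back-of-envelope reading of this config (illustrative arithmetic, not repo metadata) shows why the extended vocabulary dominates the checkpoint size in the shards below:

```python
# Size of the token embedding matrix implied by config.json above,
# stored in bfloat16 (2 bytes per parameter).
vocab_size = 256606   # from config.json
hidden_size = 2304    # from config.json

embed_params = vocab_size * hidden_size
embed_bytes_bf16 = embed_params * 2

print(embed_params)      # 591220224 parameters
print(embed_bytes_bf16)  # 1182440448 bytes, roughly 1.1 GiB
```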
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "cache_implementation": "hybrid",
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.52.1",
+   "use_cache": false
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b5e987b6ff50d86c9c9bad2d803bb1dd1f10baebc1e4df65fa16f1d275bab3f
+ size 4990818208
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5189aa23cd8d9d48e7bce61a6e1a0106a8cda856ccd3c938e84908f96c3790bb
+ size 240691728
model.safetensors.index.json ADDED
@@ -0,0 +1,295 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 5231476224
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.14.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.15.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.15.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.16.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.16.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.17.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.17.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors"
+ }
+ }
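The `weight_map` in `model.safetensors.index.json` tells loaders which shard file stores each parameter; note that layer 24 is split across both shards (its attention projections stay in shard 1 while most of its MLP lands in shard 2). A minimal sketch of how such an index can be grouped by shard, using a small hypothetical excerpt of the map above:

```python
from collections import defaultdict

# Hypothetical excerpt of the weight_map shown in the diff above.
index = {
    "weight_map": {
        "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
        "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
        "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
        "model.norm.weight": "model-00002-of-00002.safetensors",
    }
}

def weights_per_shard(index: dict) -> dict:
    """Group parameter names by the shard file that stores them."""
    shards = defaultdict(list)
    for name, shard in index["weight_map"].items():
        shards[shard].append(name)
    return dict(shards)

shards = weights_per_shard(index)
print(sorted(shards))  # the two shard filenames
```

Loaders such as `transformers.from_pretrained` resolve each tensor through exactly this mapping, opening only the shards that contain the requested weights.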
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<bos>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<eos>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9bc615c6c14973dda6a5058c6ee0d3693d581fa5067b99c2c7af6388101fbb8
+ size 34472979
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+ size 4241003
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 2097228602327040.0,
+ "train_loss": 0.050915672732994244,
+ "train_runtime": 738268.7103,
+ "train_samples_per_second": 10.388,
+ "train_steps_per_second": 0.325
+ }
trainer_log.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:144360cec1f136fe47a8c04783405f9d6a6f86f02e4caaf557091c1f6aebd8e5
+ size 7544