diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4eeacd4a3b7d757bda34333acd400906e8cd89db --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md @@ -0,0 +1,58 @@ +--- +base_model: google/gemma-4-31B +library_name: transformers +model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1 + +This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/sfblzvnx) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..920897064cad23ae39b98fad16bd6f3c52a68044 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md @@ -0,0 +1,58 @@ +--- +base_model: google/gemma-4-31B +library_name: transformers +model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test2 +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test2 + +This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ncgnoczk) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c53679e8b51a819a5c7ea83dcb846ef04d0c9fa3 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json @@ -0,0 +1,297 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1155, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.957948339009064e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43785a5a9fef645220936257116a6dff036a2eb2 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json @@ -0,0 +1,378 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + }, + { + "entropy": 0.37655152073457615, + "epoch": 3.116959064327485, + "grad_norm": 1.504384160041809, + "learning_rate": 0.0003313287743759729, + "loss": 1.3653451538085937, + "mean_token_accuracy": 0.8971295344769655, + "num_tokens": 2939773.0, + "step": 1200 + }, + { + "entropy": 0.37069276951253416, + "epoch": 3.246913580246914, + "grad_norm": 1.9665946960449219, + "learning_rate": 0.0003253772935629151, + "loss": 1.3458108520507812, + "mean_token_accuracy": 0.8982205548882485, + "num_tokens": 3063617.0, + "step": 1250 + }, + { + "entropy": 0.37295883789658546, + "epoch": 3.3768680961663415, + "grad_norm": 1.7501362562179565, + "learning_rate": 0.00031914855214759165, + "loss": 1.357562255859375, + "mean_token_accuracy": 0.8977113124728203, + "num_tokens": 3189800.0, + "step": 1300 + }, + { + "entropy": 0.3805788069963455, + "epoch": 3.50682261208577, + "grad_norm": 1.7277154922485352, + "learning_rate": 0.00031265534863374894, + "loss": 1.3735618591308594, + "mean_token_accuracy": 0.8962143072485924, + "num_tokens": 3311908.0, + "step": 1350 + }, + { + "entropy": 0.3840580120682716, + "epoch": 3.636777128005198, + "grad_norm": 2.2338802814483643, + "learning_rate": 0.0003059110249285165, + "loss": 1.3903216552734374, + "mean_token_accuracy": 0.8958476388454437, + "num_tokens": 3432934.0, + "step": 1400 + }, + { + "entropy": 0.37621145449578763, + "epoch": 3.7667316439246266, + "grad_norm": 1.9029661417007446, + "learning_rate": 0.00029892943892812944, + "loss": 1.3776657104492187, + "mean_token_accuracy": 0.8964926180243492, + "num_tokens": 3561408.0, + "step": 1450 + }, + { + "entropy": 0.3784803995490074, + "epoch": 3.8966861598440543, + "grad_norm": 2.089708089828491, + "learning_rate": 0.00029172493604342163, + "loss": 1.3816807556152344, + "mean_token_accuracy": 0.8962833172082901, + "num_tokens": 3684624.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4351254403591156, + "eval_loss": 0.5814722180366516, + "eval_mean_token_accuracy": 0.8530604747625498, + "eval_num_tokens": 3779128.0, + "eval_runtime": 90.2232, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 1540 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3259599564032195e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..045103965b96b67419e62ecd21dff7b58bdf1ab7 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json @@ -0,0 +1,469 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1925, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + }, + { + "entropy": 0.37655152073457615, + "epoch": 3.116959064327485, + "grad_norm": 1.504384160041809, + "learning_rate": 0.0003313287743759729, + "loss": 1.3653451538085937, + "mean_token_accuracy": 0.8971295344769655, + "num_tokens": 2939773.0, + "step": 1200 + }, + { + "entropy": 0.37069276951253416, + "epoch": 3.246913580246914, + "grad_norm": 1.9665946960449219, + "learning_rate": 0.0003253772935629151, + "loss": 1.3458108520507812, + "mean_token_accuracy": 0.8982205548882485, + "num_tokens": 3063617.0, + "step": 1250 + }, + { + "entropy": 0.37295883789658546, + "epoch": 3.3768680961663415, + "grad_norm": 1.7501362562179565, + "learning_rate": 0.00031914855214759165, + "loss": 1.357562255859375, + "mean_token_accuracy": 0.8977113124728203, + "num_tokens": 3189800.0, + "step": 1300 + }, + { + "entropy": 0.3805788069963455, + "epoch": 3.50682261208577, + "grad_norm": 1.7277154922485352, + "learning_rate": 0.00031265534863374894, + "loss": 1.3735618591308594, + "mean_token_accuracy": 0.8962143072485924, + "num_tokens": 3311908.0, + "step": 1350 + }, + { + "entropy": 0.3840580120682716, + "epoch": 3.636777128005198, + "grad_norm": 2.2338802814483643, + "learning_rate": 0.0003059110249285165, + "loss": 1.3903216552734374, + "mean_token_accuracy": 0.8958476388454437, + "num_tokens": 3432934.0, + "step": 1400 + }, + { + "entropy": 0.37621145449578763, + "epoch": 3.7667316439246266, + "grad_norm": 1.9029661417007446, + "learning_rate": 0.00029892943892812944, + "loss": 1.3776657104492187, + "mean_token_accuracy": 0.8964926180243492, + "num_tokens": 3561408.0, + "step": 1450 + }, + { + "entropy": 0.3784803995490074, + "epoch": 3.8966861598440543, + "grad_norm": 2.089708089828491, + "learning_rate": 0.00029172493604342163, + "loss": 1.3816807556152344, + "mean_token_accuracy": 0.8962833172082901, + "num_tokens": 3684624.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4351254403591156, + "eval_loss": 0.5814722180366516, + "eval_mean_token_accuracy": 0.8530604747625498, + "eval_num_tokens": 3779128.0, + "eval_runtime": 90.2232, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 1540 + }, + { + "entropy": 0.36326556409423677, + "epoch": 4.025990903183885, + "grad_norm": 2.1354947090148926, + "learning_rate": 0.0002843123197235993, + "loss": 1.3295362854003907, + "mean_token_accuracy": 0.8993093811686913, + "num_tokens": 3804993.0, + "step": 1550 + }, + { + "entropy": 0.2879397062957287, + "epoch": 4.155945419103314, + "grad_norm": 2.201097011566162, + "learning_rate": 0.0002767068210388601, + "loss": 1.0272974395751953, + "mean_token_accuracy": 0.9182627710700035, + "num_tokens": 3928162.0, + "step": 1600 + }, + { + "entropy": 0.2848948486149311, + "epoch": 4.2858999350227425, + "grad_norm": 2.01479172706604, + "learning_rate": 0.000268924067384358, + "loss": 1.0278727722167968, + "mean_token_accuracy": 0.9194766515493393, + "num_tokens": 4049012.0, + "step": 1650 + }, + { + "entropy": 0.2940504560619593, + "epoch": 4.41585445094217, + "grad_norm": 2.0893027782440186, + "learning_rate": 0.00026098005036982003, + "loss": 1.0586751556396485, + "mean_token_accuracy": 0.9167885810136795, + "num_tokens": 4167845.0, + "step": 1700 + }, + { + "entropy": 0.293505182415247, + "epoch": 4.545808966861598, + "grad_norm": 1.6346389055252075, + "learning_rate": 0.0002528910929607928, + "loss": 1.0669570922851563, + "mean_token_accuracy": 0.9160876458883286, + "num_tokens": 4287505.0, + "step": 1750 + }, + { + "entropy": 0.2898535231500864, + "epoch": 4.675763482781027, + "grad_norm": 1.6645033359527588, + "learning_rate": 0.0002446738159390364, + "loss": 1.0582612609863282, + "mean_token_accuracy": 0.9177632886171341, + "num_tokens": 4412221.0, + "step": 1800 + }, + { + "entropy": 0.2842763290554285, + "epoch": 4.805717998700455, + "grad_norm": 2.4594268798828125, + "learning_rate": 0.0002363451037509798, + "loss": 1.0467537689208983, + "mean_token_accuracy": 0.9177608361840248, + "num_tokens": 4537178.0, + "step": 1850 + }, + { + "entropy": 0.284430123642087, + "epoch": 4.935672514619883, + "grad_norm": 2.1724514961242676, + "learning_rate": 0.00022792206981441223, + "loss": 1.0753899383544923, + "mean_token_accuracy": 0.915192686021328, + "num_tokens": 4664196.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3632780872285366, + "eval_loss": 0.6438126564025879, + "eval_mean_token_accuracy": 0.8511462942338907, + "eval_num_tokens": 4723910.0, + "eval_runtime": 90.1846, + "eval_samples_per_second": 18.373, + "eval_steps_per_second": 2.306, + "step": 1925 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6564080889424607e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e91b74233d2370bd5168eda1b78cdedaca5404e --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json @@ -0,0 +1,560 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + }, + { + "entropy": 0.37655152073457615, + "epoch": 3.116959064327485, + "grad_norm": 1.504384160041809, + "learning_rate": 0.0003313287743759729, + "loss": 1.3653451538085937, + "mean_token_accuracy": 0.8971295344769655, + "num_tokens": 2939773.0, + "step": 1200 + }, + { + "entropy": 0.37069276951253416, + "epoch": 3.246913580246914, + "grad_norm": 1.9665946960449219, + "learning_rate": 0.0003253772935629151, + "loss": 1.3458108520507812, + "mean_token_accuracy": 0.8982205548882485, + "num_tokens": 3063617.0, + "step": 1250 + }, + { + "entropy": 0.37295883789658546, + "epoch": 3.3768680961663415, + "grad_norm": 1.7501362562179565, + "learning_rate": 0.00031914855214759165, + "loss": 1.357562255859375, + "mean_token_accuracy": 0.8977113124728203, + "num_tokens": 3189800.0, + "step": 1300 + }, + { + "entropy": 0.3805788069963455, + "epoch": 3.50682261208577, + "grad_norm": 1.7277154922485352, + "learning_rate": 0.00031265534863374894, + "loss": 1.3735618591308594, + "mean_token_accuracy": 0.8962143072485924, + "num_tokens": 3311908.0, + "step": 1350 + }, + { + "entropy": 0.3840580120682716, + "epoch": 3.636777128005198, + "grad_norm": 2.2338802814483643, + "learning_rate": 0.0003059110249285165, + "loss": 1.3903216552734374, + "mean_token_accuracy": 0.8958476388454437, + "num_tokens": 3432934.0, + "step": 1400 + }, + { + "entropy": 0.37621145449578763, + "epoch": 3.7667316439246266, + "grad_norm": 1.9029661417007446, + "learning_rate": 0.00029892943892812944, + "loss": 1.3776657104492187, + "mean_token_accuracy": 0.8964926180243492, + "num_tokens": 3561408.0, + "step": 1450 + }, + { + "entropy": 0.3784803995490074, + "epoch": 3.8966861598440543, + "grad_norm": 2.089708089828491, + "learning_rate": 0.00029172493604342163, + "loss": 1.3816807556152344, + "mean_token_accuracy": 0.8962833172082901, + "num_tokens": 3684624.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4351254403591156, + "eval_loss": 0.5814722180366516, + "eval_mean_token_accuracy": 0.8530604747625498, + "eval_num_tokens": 3779128.0, + "eval_runtime": 90.2232, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 1540 + }, + { + "entropy": 0.36326556409423677, + "epoch": 4.025990903183885, + "grad_norm": 2.1354947090148926, + "learning_rate": 0.0002843123197235993, + "loss": 1.3295362854003907, + "mean_token_accuracy": 0.8993093811686913, + "num_tokens": 3804993.0, + "step": 1550 + }, + { + "entropy": 0.2879397062957287, + "epoch": 4.155945419103314, + "grad_norm": 2.201097011566162, + "learning_rate": 0.0002767068210388601, + "loss": 1.0272974395751953, + "mean_token_accuracy": 0.9182627710700035, + "num_tokens": 3928162.0, + "step": 1600 + }, + { + "entropy": 0.2848948486149311, + "epoch": 4.2858999350227425, + "grad_norm": 2.01479172706604, + "learning_rate": 0.000268924067384358, + "loss": 1.0278727722167968, + "mean_token_accuracy": 0.9194766515493393, + "num_tokens": 4049012.0, + "step": 1650 + }, + { + "entropy": 0.2940504560619593, + "epoch": 4.41585445094217, + "grad_norm": 2.0893027782440186, + "learning_rate": 0.00026098005036982003, + "loss": 1.0586751556396485, + "mean_token_accuracy": 0.9167885810136795, + "num_tokens": 4167845.0, + "step": 1700 + }, + { + "entropy": 0.293505182415247, + "epoch": 4.545808966861598, + "grad_norm": 1.6346389055252075, + "learning_rate": 0.0002528910929607928, + "loss": 1.0669570922851563, + "mean_token_accuracy": 0.9160876458883286, + "num_tokens": 4287505.0, + "step": 1750 + }, + { + "entropy": 0.2898535231500864, + "epoch": 4.675763482781027, + "grad_norm": 1.6645033359527588, + "learning_rate": 0.0002446738159390364, + "loss": 1.0582612609863282, + "mean_token_accuracy": 0.9177632886171341, + "num_tokens": 4412221.0, + "step": 1800 + }, + { + "entropy": 0.2842763290554285, + "epoch": 4.805717998700455, + "grad_norm": 2.4594268798828125, + "learning_rate": 0.0002363451037509798, + "loss": 1.0467537689208983, + "mean_token_accuracy": 0.9177608361840248, + "num_tokens": 4537178.0, + "step": 1850 + }, + { + "entropy": 0.284430123642087, + "epoch": 4.935672514619883, + "grad_norm": 2.1724514961242676, + "learning_rate": 0.00022792206981441223, + "loss": 1.0753899383544923, + "mean_token_accuracy": 0.915192686021328, + "num_tokens": 4664196.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3632780872285366, + "eval_loss": 0.6438126564025879, + "eval_mean_token_accuracy": 0.8511462942338907, + "eval_num_tokens": 4723910.0, + "eval_runtime": 90.1846, + "eval_samples_per_second": 18.373, + "eval_steps_per_second": 2.306, + "step": 1925 + }, + { + "entropy": 0.23515464736139355, + "epoch": 5.064977257959714, + "grad_norm": 1.651587724685669, + "learning_rate": 0.00021942202135469513, + "loss": 0.8597064971923828, + "mean_token_accuracy": 0.9324622603517082, + "num_tokens": 4789568.0, + "step": 1950 + }, + { + "entropy": 0.1958953895419836, + "epoch": 5.1949317738791425, + "grad_norm": 1.923292636871338, + "learning_rate": 0.0002108624238427481, + "loss": 0.7188112640380859, + "mean_token_accuracy": 0.9416415295004845, + "num_tokens": 4913407.0, + "step": 2000 + }, + { + "entropy": 0.21068542070686816, + "epoch": 5.32488628979857, + "grad_norm": 2.299356460571289, + "learning_rate": 0.0002022608651078804, + "loss": 0.7712985229492187, + "mean_token_accuracy": 0.9386440163850784, + "num_tokens": 5032951.0, + "step": 2050 + }, + { + "entropy": 0.21234643168747425, + "epoch": 5.454840805717999, + "grad_norm": 2.2119295597076416, + "learning_rate": 0.00019363501919920608, + "loss": 0.7650181579589844, + "mean_token_accuracy": 0.938471505343914, + "num_tokens": 5156908.0, + "step": 2100 + }, + { + "entropy": 0.21658269092440605, + "epoch": 5.584795321637427, + "grad_norm": 1.5394288301467896, + "learning_rate": 0.00018500261006989887, + "loss": 0.7784209442138672, + "mean_token_accuracy": 0.9371598136425018, + "num_tokens": 5276087.0, + "step": 2150 + }, + { + "entropy": 0.2045296123996377, + "epoch": 5.714749837556855, + "grad_norm": 1.913680076599121, + "learning_rate": 0.00017638137515890763, + "loss": 0.7638166046142578, + "mean_token_accuracy": 0.9378301629424095, + "num_tokens": 5398787.0, + "step": 2200 + }, + { + "entropy": 0.20917976945638656, + "epoch": 5.844704353476283, + "grad_norm": 2.0847299098968506, + "learning_rate": 0.00016778902894496063, + "loss": 0.7631703186035156, + "mean_token_accuracy": 0.9387557968497277, + "num_tokens": 5522332.0, + "step": 2250 + }, + { + "entropy": 0.22262076318264007, + "epoch": 5.974658869395712, + "grad_norm": 2.1597352027893066, + "learning_rate": 0.0001592432265477485, + "loss": 0.798133773803711, + "mean_token_accuracy": 0.936034984588623, + "num_tokens": 5642361.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.31502799331568754, + "eval_loss": 0.7417300343513489, + "eval_mean_token_accuracy": 0.8477253922476218, + "eval_num_tokens": 5668692.0, + "eval_runtime": 90.4252, + "eval_samples_per_second": 18.325, + "eval_steps_per_second": 2.3, + "step": 2310 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9871331143277489e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..be7d4ed0fedbbb3e6f780b12a7ff0327e3d8b947 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json @@ -0,0 +1,641 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2695, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + }, + { + "entropy": 0.37655152073457615, + "epoch": 3.116959064327485, + "grad_norm": 1.504384160041809, + "learning_rate": 0.0003313287743759729, + "loss": 1.3653451538085937, + "mean_token_accuracy": 0.8971295344769655, + "num_tokens": 2939773.0, + "step": 1200 + }, + { + "entropy": 0.37069276951253416, + "epoch": 3.246913580246914, + "grad_norm": 1.9665946960449219, + "learning_rate": 0.0003253772935629151, + "loss": 1.3458108520507812, + "mean_token_accuracy": 0.8982205548882485, + "num_tokens": 3063617.0, + "step": 1250 + }, + { + "entropy": 0.37295883789658546, + "epoch": 3.3768680961663415, + "grad_norm": 1.7501362562179565, + "learning_rate": 0.00031914855214759165, + "loss": 1.357562255859375, + "mean_token_accuracy": 0.8977113124728203, + "num_tokens": 3189800.0, + "step": 1300 + }, + { + "entropy": 0.3805788069963455, + "epoch": 3.50682261208577, + "grad_norm": 1.7277154922485352, + "learning_rate": 0.00031265534863374894, + "loss": 1.3735618591308594, + "mean_token_accuracy": 0.8962143072485924, + "num_tokens": 3311908.0, + "step": 1350 + }, + { + "entropy": 0.3840580120682716, + "epoch": 3.636777128005198, + "grad_norm": 2.2338802814483643, + "learning_rate": 0.0003059110249285165, + "loss": 1.3903216552734374, + "mean_token_accuracy": 0.8958476388454437, + "num_tokens": 3432934.0, + "step": 1400 + }, + { + "entropy": 0.37621145449578763, + "epoch": 3.7667316439246266, + "grad_norm": 1.9029661417007446, + "learning_rate": 0.00029892943892812944, + "loss": 1.3776657104492187, + "mean_token_accuracy": 0.8964926180243492, + "num_tokens": 3561408.0, + "step": 1450 + }, + { + "entropy": 0.3784803995490074, + "epoch": 3.8966861598440543, + "grad_norm": 2.089708089828491, + "learning_rate": 0.00029172493604342163, + "loss": 1.3816807556152344, + "mean_token_accuracy": 0.8962833172082901, + "num_tokens": 3684624.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4351254403591156, + "eval_loss": 0.5814722180366516, + "eval_mean_token_accuracy": 0.8530604747625498, + "eval_num_tokens": 3779128.0, + "eval_runtime": 90.2232, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 1540 + }, + { + "entropy": 0.36326556409423677, + "epoch": 4.025990903183885, + "grad_norm": 2.1354947090148926, + "learning_rate": 0.0002843123197235993, + "loss": 1.3295362854003907, + "mean_token_accuracy": 0.8993093811686913, + "num_tokens": 3804993.0, + "step": 1550 + }, + { + "entropy": 0.2879397062957287, + "epoch": 4.155945419103314, + "grad_norm": 2.201097011566162, + "learning_rate": 0.0002767068210388601, + "loss": 1.0272974395751953, + "mean_token_accuracy": 0.9182627710700035, + "num_tokens": 3928162.0, + "step": 1600 + }, + { + "entropy": 0.2848948486149311, + "epoch": 4.2858999350227425, + "grad_norm": 2.01479172706604, + "learning_rate": 0.000268924067384358, + "loss": 1.0278727722167968, + "mean_token_accuracy": 0.9194766515493393, + "num_tokens": 4049012.0, + "step": 1650 + }, + { + "entropy": 0.2940504560619593, + "epoch": 4.41585445094217, + "grad_norm": 2.0893027782440186, + "learning_rate": 0.00026098005036982003, + "loss": 1.0586751556396485, + "mean_token_accuracy": 0.9167885810136795, + "num_tokens": 4167845.0, + "step": 1700 + }, + { + "entropy": 0.293505182415247, + "epoch": 4.545808966861598, + "grad_norm": 1.6346389055252075, + "learning_rate": 0.0002528910929607928, + "loss": 1.0669570922851563, + "mean_token_accuracy": 0.9160876458883286, + "num_tokens": 4287505.0, + "step": 1750 + }, + { + "entropy": 0.2898535231500864, + "epoch": 4.675763482781027, + "grad_norm": 1.6645033359527588, + "learning_rate": 0.0002446738159390364, + "loss": 1.0582612609863282, + "mean_token_accuracy": 0.9177632886171341, + "num_tokens": 4412221.0, + "step": 1800 + }, + { + "entropy": 0.2842763290554285, + "epoch": 4.805717998700455, + "grad_norm": 2.4594268798828125, + "learning_rate": 0.0002363451037509798, + "loss": 1.0467537689208983, + "mean_token_accuracy": 0.9177608361840248, + "num_tokens": 4537178.0, + "step": 1850 + }, + { + "entropy": 0.284430123642087, + "epoch": 4.935672514619883, + "grad_norm": 2.1724514961242676, + "learning_rate": 0.00022792206981441223, + "loss": 1.0753899383544923, + "mean_token_accuracy": 0.915192686021328, + "num_tokens": 4664196.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3632780872285366, + "eval_loss": 0.6438126564025879, + "eval_mean_token_accuracy": 0.8511462942338907, + "eval_num_tokens": 4723910.0, + "eval_runtime": 90.1846, + "eval_samples_per_second": 18.373, + "eval_steps_per_second": 2.306, + "step": 1925 + }, + { + "entropy": 0.23515464736139355, + "epoch": 5.064977257959714, + "grad_norm": 1.651587724685669, + "learning_rate": 0.00021942202135469513, + "loss": 0.8597064971923828, + "mean_token_accuracy": 0.9324622603517082, + "num_tokens": 4789568.0, + "step": 1950 + }, + { + "entropy": 0.1958953895419836, + "epoch": 5.1949317738791425, + "grad_norm": 1.923292636871338, + "learning_rate": 0.0002108624238427481, + "loss": 0.7188112640380859, + "mean_token_accuracy": 0.9416415295004845, + "num_tokens": 4913407.0, + "step": 2000 + }, + { + "entropy": 0.21068542070686816, + "epoch": 5.32488628979857, + "grad_norm": 2.299356460571289, + "learning_rate": 0.0002022608651078804, + "loss": 0.7712985229492187, + "mean_token_accuracy": 0.9386440163850784, + "num_tokens": 5032951.0, + "step": 2050 + }, + { + "entropy": 0.21234643168747425, + "epoch": 5.454840805717999, + "grad_norm": 2.2119295597076416, + "learning_rate": 0.00019363501919920608, + "loss": 0.7650181579589844, + "mean_token_accuracy": 0.938471505343914, + "num_tokens": 5156908.0, + "step": 2100 + }, + { + "entropy": 0.21658269092440605, + "epoch": 5.584795321637427, + "grad_norm": 1.5394288301467896, + "learning_rate": 0.00018500261006989887, + "loss": 0.7784209442138672, + "mean_token_accuracy": 0.9371598136425018, + "num_tokens": 5276087.0, + "step": 2150 + }, + { + "entropy": 0.2045296123996377, + "epoch": 5.714749837556855, + "grad_norm": 1.913680076599121, + "learning_rate": 0.00017638137515890763, + "loss": 0.7638166046142578, + "mean_token_accuracy": 0.9378301629424095, + "num_tokens": 5398787.0, + "step": 2200 + }, + { + "entropy": 0.20917976945638656, + "epoch": 5.844704353476283, + "grad_norm": 2.0847299098968506, + "learning_rate": 0.00016778902894496063, + "loss": 0.7631703186035156, + "mean_token_accuracy": 0.9387557968497277, + "num_tokens": 5522332.0, + "step": 2250 + }, + { + "entropy": 0.22262076318264007, + "epoch": 5.974658869395712, + "grad_norm": 2.1597352027893066, + "learning_rate": 0.0001592432265477485, + "loss": 0.798133773803711, + "mean_token_accuracy": 0.936034984588623, + "num_tokens": 5642361.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.31502799331568754, + "eval_loss": 0.7417300343513489, + "eval_mean_token_accuracy": 0.8477253922476218, + "eval_num_tokens": 5668692.0, + "eval_runtime": 90.4252, + "eval_samples_per_second": 18.325, + "eval_steps_per_second": 2.3, + "step": 2310 + }, + { + "entropy": 0.16796037876725795, + "epoch": 6.1039636127355426, + "grad_norm": 2.2228569984436035, + "learning_rate": 0.00015076152745107442, + "loss": 0.5835284805297851, + "mean_token_accuracy": 0.9529892874123463, + "num_tokens": 5766129.0, + "step": 2350 + }, + { + "entropy": 0.14919219192117453, + "epoch": 6.23391812865497, + "grad_norm": 1.408840298652649, + "learning_rate": 0.00014236135942251215, + "loss": 0.5310631561279296, + "mean_token_accuracy": 0.9586454060673714, + "num_tokens": 5888746.0, + "step": 2400 + }, + { + "entropy": 0.1499051059409976, + "epoch": 6.363872644574399, + "grad_norm": 1.8611102104187012, + "learning_rate": 0.00013405998270370849, + "loss": 0.5127810668945313, + "mean_token_accuracy": 0.9591325157880783, + "num_tokens": 6014455.0, + "step": 2450 + }, + { + "entropy": 0.15334193099290133, + "epoch": 6.493827160493828, + "grad_norm": 1.6051015853881836, + "learning_rate": 0.00012587445454490892, + "loss": 0.5349758529663086, + "mean_token_accuracy": 0.9574431091547012, + "num_tokens": 6141229.0, + "step": 2500 + }, + { + "entropy": 0.15982334002852439, + "epoch": 6.623781676413255, + "grad_norm": 3.7065205574035645, + "learning_rate": 0.00011782159415658008, + "loss": 0.5602469253540039, + "mean_token_accuracy": 0.9555372184515, + "num_tokens": 6257983.0, + "step": 2550 + }, + { + "entropy": 0.16072992872446776, + "epoch": 6.753736192332683, + "grad_norm": 2.282320976257324, + "learning_rate": 0.00010991794815014401, + "loss": 0.5657939910888672, + "mean_token_accuracy": 0.9550630164146423, + "num_tokens": 6376198.0, + "step": 2600 + }, + { + "entropy": 0.1512781011685729, + "epoch": 6.883690708252112, + "grad_norm": 1.3716893196105957, + "learning_rate": 0.00010217975653883603, + "loss": 0.5340792465209961, + "mean_token_accuracy": 0.9578188157081604, + "num_tokens": 6502526.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2444461930829745, + "eval_loss": 0.8798949718475342, + "eval_mean_token_accuracy": 0.8457763839799625, + "eval_num_tokens": 6613474.0, + "eval_runtime": 90.2868, + "eval_samples_per_second": 18.353, + "eval_steps_per_second": 2.304, + "step": 2695 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.31810912445653e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6144d96a813fa7fd1ee98cb6160f42880081fc05 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.00985279561940916, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..957ae0bacf105fc23db7704b7d2020b1b2b6b335 --- /dev/null +++ b/DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json @@ -0,0 +1,732 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 3080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.353258643448353, + "epoch": 0.1299545159194282, + "grad_norm": 3.010725975036621, + "learning_rate": 4.8475852375026876e-05, + "loss": 5.475971069335937, + "mean_token_accuracy": 0.7263440760970116, + "num_tokens": 128842.0, + "step": 50 + }, + { + "entropy": 0.649170914888382, + "epoch": 0.2599090318388564, + "grad_norm": 1.9099390506744385, + "learning_rate": 9.794100785974817e-05, + "loss": 2.55168701171875, + "mean_token_accuracy": 0.8364580717682838, + "num_tokens": 255497.0, + "step": 100 + }, + { + "entropy": 0.5930788792669773, + "epoch": 0.3898635477582846, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0001474061633444695, + "loss": 2.3440716552734373, + "mean_token_accuracy": 0.8452290838956833, + "num_tokens": 372014.0, + "step": 150 + }, + { + "entropy": 0.5564522063732147, + "epoch": 0.5198180636777128, + "grad_norm": 411.71807861328125, + "learning_rate": 0.00019687131882919077, + "loss": 2.2838446044921876, + "mean_token_accuracy": 0.8498487600684166, + "num_tokens": 500623.0, + "step": 200 + }, + { + "entropy": 0.5539529167115689, + "epoch": 0.649772579597141, + "grad_norm": 2.1969902515411377, + "learning_rate": 0.0002463364743139121, + "loss": 2.675394287109375, + "mean_token_accuracy": 0.8430694487690925, + "num_tokens": 616223.0, + "step": 250 + }, + { + "entropy": 0.5719467167556286, + "epoch": 0.7797270955165692, + "grad_norm": 1.98796546459198, + "learning_rate": 0.00029580162979863343, + "loss": 2.2434300231933593, + "mean_token_accuracy": 0.851241897046566, + "num_tokens": 737263.0, + "step": 300 + }, + { + "entropy": 0.5502805083990097, + "epoch": 0.9096816114359974, + "grad_norm": 2.0211398601531982, + "learning_rate": 0.0003452667852833547, + "loss": 2.1729367065429686, + "mean_token_accuracy": 0.8554597494006156, + "num_tokens": 861477.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5580813550891784, + "eval_loss": 0.5830356478691101, + "eval_mean_token_accuracy": 0.8432669037809739, + "eval_num_tokens": 944782.0, + "eval_runtime": 90.3664, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 2.302, + "step": 385 + }, + { + "entropy": 0.5498402091725987, + "epoch": 1.0389863547758285, + "grad_norm": 3.8034188747406006, + "learning_rate": 0.000380866355527619, + "loss": 2.113946990966797, + "mean_token_accuracy": 0.8578129452676629, + "num_tokens": 982803.0, + "step": 400 + }, + { + "entropy": 0.5182110907137394, + "epoch": 1.1689408706952567, + "grad_norm": 2.7830824851989746, + "learning_rate": 0.0003805611725593471, + "loss": 1.9833453369140626, + "mean_token_accuracy": 0.8656822636723518, + "num_tokens": 1105926.0, + "step": 450 + }, + { + "entropy": 0.5260789206624031, + "epoch": 1.2988953866146848, + "grad_norm": 1.7993361949920654, + "learning_rate": 0.0003798653399371568, + "loss": 2.006897430419922, + "mean_token_accuracy": 0.8631055191159248, + "num_tokens": 1229857.0, + "step": 500 + }, + { + "entropy": 0.5327546864748001, + "epoch": 1.428849902534113, + "grad_norm": 1.7606678009033203, + "learning_rate": 0.0003787802874228295, + "loss": 2.020283050537109, + "mean_token_accuracy": 0.8638329988718033, + "num_tokens": 1352330.0, + "step": 550 + }, + { + "entropy": 0.5285360223054886, + "epoch": 1.5588044184535412, + "grad_norm": 4.76006555557251, + "learning_rate": 0.00037730824452755275, + "loss": 1.9987391662597656, + "mean_token_accuracy": 0.8644696187973022, + "num_tokens": 1474790.0, + "step": 600 + }, + { + "entropy": 0.5134804363548756, + "epoch": 1.6887589343729694, + "grad_norm": 1.8447264432907104, + "learning_rate": 0.000375452235930833, + "loss": 1.9669386291503905, + "mean_token_accuracy": 0.8659948265552521, + "num_tokens": 1600381.0, + "step": 650 + }, + { + "entropy": 0.5371069309115409, + "epoch": 1.8187134502923976, + "grad_norm": 1.6537392139434814, + "learning_rate": 0.00037321607526553675, + "loss": 2.0411550903320315, + "mean_token_accuracy": 0.8624854254722595, + "num_tokens": 1716827.0, + "step": 700 + }, + { + "entropy": 0.5270501750707627, + "epoch": 1.9486679662118258, + "grad_norm": 2.6990911960601807, + "learning_rate": 0.00037060435728183, + "loss": 2.015792236328125, + "mean_token_accuracy": 0.8631013777852058, + "num_tokens": 1842798.0, + "step": 750 + }, + { + "epoch": 2.0, + "eval_entropy": 0.5477195472384875, + "eval_loss": 0.5585702657699585, + "eval_mean_token_accuracy": 0.8486175815073344, + "eval_num_tokens": 1889564.0, + "eval_runtime": 90.2194, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 770 + }, + { + "entropy": 0.4782189565088282, + "epoch": 2.077972709551657, + "grad_norm": 2.041952610015869, + "learning_rate": 0.0003676224484061175, + "loss": 1.7843829345703126, + "mean_token_accuracy": 0.8739750406250881, + "num_tokens": 1959778.0, + "step": 800 + }, + { + "entropy": 0.4443667846918106, + "epoch": 2.207927225471085, + "grad_norm": 16.27313804626465, + "learning_rate": 0.00036427647571437996, + "loss": 1.6559255981445313, + "mean_token_accuracy": 0.8808386281132699, + "num_tokens": 2087384.0, + "step": 850 + }, + { + "entropy": 0.44861202985048293, + "epoch": 2.3378817413905133, + "grad_norm": 1.648870587348938, + "learning_rate": 0.0003605733143425679, + "loss": 1.677943878173828, + "mean_token_accuracy": 0.879555520415306, + "num_tokens": 2211962.0, + "step": 900 + }, + { + "entropy": 0.4568726105988026, + "epoch": 2.4678362573099415, + "grad_norm": 1.7573126554489136, + "learning_rate": 0.00035652057335991866, + "loss": 1.6760734558105468, + "mean_token_accuracy": 0.8791913360357284, + "num_tokens": 2334838.0, + "step": 950 + }, + { + "entropy": 0.44863338857889173, + "epoch": 2.5977907732293697, + "grad_norm": 1.8639047145843506, + "learning_rate": 0.00035212658013422465, + "loss": 1.6799411010742187, + "mean_token_accuracy": 0.8790675121545791, + "num_tokens": 2461732.0, + "step": 1000 + }, + { + "entropy": 0.4585830120742321, + "epoch": 2.727745289148798, + "grad_norm": 1.9825985431671143, + "learning_rate": 0.0003474003632211781, + "loss": 1.7172026062011718, + "mean_token_accuracy": 0.8782495930790901, + "num_tokens": 2580026.0, + "step": 1050 + }, + { + "entropy": 0.45422692246735097, + "epoch": 2.857699805068226, + "grad_norm": 1.7149962186813354, + "learning_rate": 0.00034235163381294995, + "loss": 1.679084014892578, + "mean_token_accuracy": 0.8795321774482727, + "num_tokens": 2705600.0, + "step": 1100 + }, + { + "entropy": 0.47297614574432373, + "epoch": 2.9876543209876543, + "grad_norm": 1.7435617446899414, + "learning_rate": 0.0003369907657841221, + "loss": 1.7386201477050782, + "mean_token_accuracy": 0.8779115182161331, + "num_tokens": 2822808.0, + "step": 1150 + }, + { + "epoch": 3.0, + "eval_entropy": 0.5031588454372607, + "eval_loss": 0.5551120638847351, + "eval_mean_token_accuracy": 0.8531603300227568, + "eval_num_tokens": 2834346.0, + "eval_runtime": 90.2397, + "eval_samples_per_second": 18.362, + "eval_steps_per_second": 2.305, + "step": 1155 + }, + { + "entropy": 0.37655152073457615, + "epoch": 3.116959064327485, + "grad_norm": 1.504384160041809, + "learning_rate": 0.0003313287743759729, + "loss": 1.3653451538085937, + "mean_token_accuracy": 0.8971295344769655, + "num_tokens": 2939773.0, + "step": 1200 + }, + { + "entropy": 0.37069276951253416, + "epoch": 3.246913580246914, + "grad_norm": 1.9665946960449219, + "learning_rate": 0.0003253772935629151, + "loss": 1.3458108520507812, + "mean_token_accuracy": 0.8982205548882485, + "num_tokens": 3063617.0, + "step": 1250 + }, + { + "entropy": 0.37295883789658546, + "epoch": 3.3768680961663415, + "grad_norm": 1.7501362562179565, + "learning_rate": 0.00031914855214759165, + "loss": 1.357562255859375, + "mean_token_accuracy": 0.8977113124728203, + "num_tokens": 3189800.0, + "step": 1300 + }, + { + "entropy": 0.3805788069963455, + "epoch": 3.50682261208577, + "grad_norm": 1.7277154922485352, + "learning_rate": 0.00031265534863374894, + "loss": 1.3735618591308594, + "mean_token_accuracy": 0.8962143072485924, + "num_tokens": 3311908.0, + "step": 1350 + }, + { + "entropy": 0.3840580120682716, + "epoch": 3.636777128005198, + "grad_norm": 2.2338802814483643, + "learning_rate": 0.0003059110249285165, + "loss": 1.3903216552734374, + "mean_token_accuracy": 0.8958476388454437, + "num_tokens": 3432934.0, + "step": 1400 + }, + { + "entropy": 0.37621145449578763, + "epoch": 3.7667316439246266, + "grad_norm": 1.9029661417007446, + "learning_rate": 0.00029892943892812944, + "loss": 1.3776657104492187, + "mean_token_accuracy": 0.8964926180243492, + "num_tokens": 3561408.0, + "step": 1450 + }, + { + "entropy": 0.3784803995490074, + "epoch": 3.8966861598440543, + "grad_norm": 2.089708089828491, + "learning_rate": 0.00029172493604342163, + "loss": 1.3816807556152344, + "mean_token_accuracy": 0.8962833172082901, + "num_tokens": 3684624.0, + "step": 1500 + }, + { + "epoch": 4.0, + "eval_entropy": 0.4351254403591156, + "eval_loss": 0.5814722180366516, + "eval_mean_token_accuracy": 0.8530604747625498, + "eval_num_tokens": 3779128.0, + "eval_runtime": 90.2232, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 2.305, + "step": 1540 + }, + { + "entropy": 0.36326556409423677, + "epoch": 4.025990903183885, + "grad_norm": 2.1354947090148926, + "learning_rate": 0.0002843123197235993, + "loss": 1.3295362854003907, + "mean_token_accuracy": 0.8993093811686913, + "num_tokens": 3804993.0, + "step": 1550 + }, + { + "entropy": 0.2879397062957287, + "epoch": 4.155945419103314, + "grad_norm": 2.201097011566162, + "learning_rate": 0.0002767068210388601, + "loss": 1.0272974395751953, + "mean_token_accuracy": 0.9182627710700035, + "num_tokens": 3928162.0, + "step": 1600 + }, + { + "entropy": 0.2848948486149311, + "epoch": 4.2858999350227425, + "grad_norm": 2.01479172706604, + "learning_rate": 0.000268924067384358, + "loss": 1.0278727722167968, + "mean_token_accuracy": 0.9194766515493393, + "num_tokens": 4049012.0, + "step": 1650 + }, + { + "entropy": 0.2940504560619593, + "epoch": 4.41585445094217, + "grad_norm": 2.0893027782440186, + "learning_rate": 0.00026098005036982003, + "loss": 1.0586751556396485, + "mean_token_accuracy": 0.9167885810136795, + "num_tokens": 4167845.0, + "step": 1700 + }, + { + "entropy": 0.293505182415247, + "epoch": 4.545808966861598, + "grad_norm": 1.6346389055252075, + "learning_rate": 0.0002528910929607928, + "loss": 1.0669570922851563, + "mean_token_accuracy": 0.9160876458883286, + "num_tokens": 4287505.0, + "step": 1750 + }, + { + "entropy": 0.2898535231500864, + "epoch": 4.675763482781027, + "grad_norm": 1.6645033359527588, + "learning_rate": 0.0002446738159390364, + "loss": 1.0582612609863282, + "mean_token_accuracy": 0.9177632886171341, + "num_tokens": 4412221.0, + "step": 1800 + }, + { + "entropy": 0.2842763290554285, + "epoch": 4.805717998700455, + "grad_norm": 2.4594268798828125, + "learning_rate": 0.0002363451037509798, + "loss": 1.0467537689208983, + "mean_token_accuracy": 0.9177608361840248, + "num_tokens": 4537178.0, + "step": 1850 + }, + { + "entropy": 0.284430123642087, + "epoch": 4.935672514619883, + "grad_norm": 2.1724514961242676, + "learning_rate": 0.00022792206981441223, + "loss": 1.0753899383544923, + "mean_token_accuracy": 0.915192686021328, + "num_tokens": 4664196.0, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_entropy": 0.3632780872285366, + "eval_loss": 0.6438126564025879, + "eval_mean_token_accuracy": 0.8511462942338907, + "eval_num_tokens": 4723910.0, + "eval_runtime": 90.1846, + "eval_samples_per_second": 18.373, + "eval_steps_per_second": 2.306, + "step": 1925 + }, + { + "entropy": 0.23515464736139355, + "epoch": 5.064977257959714, + "grad_norm": 1.651587724685669, + "learning_rate": 0.00021942202135469513, + "loss": 0.8597064971923828, + "mean_token_accuracy": 0.9324622603517082, + "num_tokens": 4789568.0, + "step": 1950 + }, + { + "entropy": 0.1958953895419836, + "epoch": 5.1949317738791425, + "grad_norm": 1.923292636871338, + "learning_rate": 0.0002108624238427481, + "loss": 0.7188112640380859, + "mean_token_accuracy": 0.9416415295004845, + "num_tokens": 4913407.0, + "step": 2000 + }, + { + "entropy": 0.21068542070686816, + "epoch": 5.32488628979857, + "grad_norm": 2.299356460571289, + "learning_rate": 0.0002022608651078804, + "loss": 0.7712985229492187, + "mean_token_accuracy": 0.9386440163850784, + "num_tokens": 5032951.0, + "step": 2050 + }, + { + "entropy": 0.21234643168747425, + "epoch": 5.454840805717999, + "grad_norm": 2.2119295597076416, + "learning_rate": 0.00019363501919920608, + "loss": 0.7650181579589844, + "mean_token_accuracy": 0.938471505343914, + "num_tokens": 5156908.0, + "step": 2100 + }, + { + "entropy": 0.21658269092440605, + "epoch": 5.584795321637427, + "grad_norm": 1.5394288301467896, + "learning_rate": 0.00018500261006989887, + "loss": 0.7784209442138672, + "mean_token_accuracy": 0.9371598136425018, + "num_tokens": 5276087.0, + "step": 2150 + }, + { + "entropy": 0.2045296123996377, + "epoch": 5.714749837556855, + "grad_norm": 1.913680076599121, + "learning_rate": 0.00017638137515890763, + "loss": 0.7638166046142578, + "mean_token_accuracy": 0.9378301629424095, + "num_tokens": 5398787.0, + "step": 2200 + }, + { + "entropy": 0.20917976945638656, + "epoch": 5.844704353476283, + "grad_norm": 2.0847299098968506, + "learning_rate": 0.00016778902894496063, + "loss": 0.7631703186035156, + "mean_token_accuracy": 0.9387557968497277, + "num_tokens": 5522332.0, + "step": 2250 + }, + { + "entropy": 0.22262076318264007, + "epoch": 5.974658869395712, + "grad_norm": 2.1597352027893066, + "learning_rate": 0.0001592432265477485, + "loss": 0.798133773803711, + "mean_token_accuracy": 0.936034984588623, + "num_tokens": 5642361.0, + "step": 2300 + }, + { + "epoch": 6.0, + "eval_entropy": 0.31502799331568754, + "eval_loss": 0.7417300343513489, + "eval_mean_token_accuracy": 0.8477253922476218, + "eval_num_tokens": 5668692.0, + "eval_runtime": 90.4252, + "eval_samples_per_second": 18.325, + "eval_steps_per_second": 2.3, + "step": 2310 + }, + { + "entropy": 0.16796037876725795, + "epoch": 6.1039636127355426, + "grad_norm": 2.2228569984436035, + "learning_rate": 0.00015076152745107442, + "loss": 0.5835284805297851, + "mean_token_accuracy": 0.9529892874123463, + "num_tokens": 5766129.0, + "step": 2350 + }, + { + "entropy": 0.14919219192117453, + "epoch": 6.23391812865497, + "grad_norm": 1.408840298652649, + "learning_rate": 0.00014236135942251215, + "loss": 0.5310631561279296, + "mean_token_accuracy": 0.9586454060673714, + "num_tokens": 5888746.0, + "step": 2400 + }, + { + "entropy": 0.1499051059409976, + "epoch": 6.363872644574399, + "grad_norm": 1.8611102104187012, + "learning_rate": 0.00013405998270370849, + "loss": 0.5127810668945313, + "mean_token_accuracy": 0.9591325157880783, + "num_tokens": 6014455.0, + "step": 2450 + }, + { + "entropy": 0.15334193099290133, + "epoch": 6.493827160493828, + "grad_norm": 1.6051015853881836, + "learning_rate": 0.00012587445454490892, + "loss": 0.5349758529663086, + "mean_token_accuracy": 0.9574431091547012, + "num_tokens": 6141229.0, + "step": 2500 + }, + { + "entropy": 0.15982334002852439, + "epoch": 6.623781676413255, + "grad_norm": 3.7065205574035645, + "learning_rate": 0.00011782159415658008, + "loss": 0.5602469253540039, + "mean_token_accuracy": 0.9555372184515, + "num_tokens": 6257983.0, + "step": 2550 + }, + { + "entropy": 0.16072992872446776, + "epoch": 6.753736192332683, + "grad_norm": 2.282320976257324, + "learning_rate": 0.00010991794815014401, + "loss": 0.5657939910888672, + "mean_token_accuracy": 0.9550630164146423, + "num_tokens": 6376198.0, + "step": 2600 + }, + { + "entropy": 0.1512781011685729, + "epoch": 6.883690708252112, + "grad_norm": 1.3716893196105957, + "learning_rate": 0.00010217975653883603, + "loss": 0.5340792465209961, + "mean_token_accuracy": 0.9578188157081604, + "num_tokens": 6502526.0, + "step": 2650 + }, + { + "epoch": 7.0, + "eval_entropy": 0.2444461930829745, + "eval_loss": 0.8798949718475342, + "eval_mean_token_accuracy": 0.8457763839799625, + "eval_num_tokens": 6613474.0, + "eval_runtime": 90.2868, + "eval_samples_per_second": 18.353, + "eval_steps_per_second": 2.304, + "step": 2695 + }, + { + "entropy": 0.1444593005668578, + "epoch": 7.012995451591943, + "grad_norm": 1.0965569019317627, + "learning_rate": 9.462291936854386e-05, + "loss": 0.511833839416504, + "mean_token_accuracy": 0.9595773016388093, + "num_tokens": 6626464.0, + "step": 2700 + }, + { + "entropy": 0.10985541097819805, + "epoch": 7.142949967511371, + "grad_norm": 1.8079149723052979, + "learning_rate": 8.726296404719584e-05, + "loss": 0.3876673126220703, + "mean_token_accuracy": 0.9704919803142548, + "num_tokens": 6746276.0, + "step": 2750 + }, + { + "entropy": 0.11304264679551125, + "epoch": 7.272904483430799, + "grad_norm": 1.5228444337844849, + "learning_rate": 8.01150134398253e-05, + "loss": 0.39335052490234373, + "mean_token_accuracy": 0.9695766788721084, + "num_tokens": 6868131.0, + "step": 2800 + }, + { + "entropy": 0.11066193280741572, + "epoch": 7.402858999350228, + "grad_norm": 2.265174388885498, + "learning_rate": 7.319375479487112e-05, + "loss": 0.38289966583251955, + "mean_token_accuracy": 0.9707033503055572, + "num_tokens": 6993803.0, + "step": 2850 + }, + { + "entropy": 0.12022399662062526, + "epoch": 7.532813515269655, + "grad_norm": 1.0657345056533813, + "learning_rate": 6.65134095655596e-05, + "loss": 0.4089087677001953, + "mean_token_accuracy": 0.9689779531955719, + "num_tokens": 7113063.0, + "step": 2900 + }, + { + "entropy": 0.11429863104596734, + "epoch": 7.662768031189084, + "grad_norm": 1.3440358638763428, + "learning_rate": 6.008770418837973e-05, + "loss": 0.3935198593139648, + "mean_token_accuracy": 0.9698223957419395, + "num_tokens": 7237174.0, + "step": 2950 + }, + { + "entropy": 0.11748226622119545, + "epoch": 7.792722547108512, + "grad_norm": 1.4607034921646118, + "learning_rate": 5.3929841878693804e-05, + "loss": 0.40399799346923826, + "mean_token_accuracy": 0.9695871344208717, + "num_tokens": 7357301.0, + "step": 3000 + }, + { + "entropy": 0.11790506653487683, + "epoch": 7.92267706302794, + "grad_norm": 1.4574708938598633, + "learning_rate": 4.805247550143646e-05, + "loss": 0.4049314880371094, + "mean_token_accuracy": 0.9693469110131264, + "num_tokens": 7482431.0, + "step": 3050 + }, + { + "epoch": 8.0, + "eval_entropy": 0.2104659411483086, + "eval_loss": 0.9939886927604675, + "eval_mean_token_accuracy": 0.8444042455118436, + "eval_num_tokens": 7558256.0, + "eval_runtime": 90.3118, + "eval_samples_per_second": 18.348, + "eval_steps_per_second": 2.303, + "step": 3080 + } + ], + "logging_steps": 50, + "max_steps": 3850, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.648642717750723e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef7cbd509e64b6efe77e24be8cbe43639e5af314 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md @@ -0,0 +1,58 @@ +--- +base_model: google/gemma-4-31B +library_name: transformers +model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1 +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1 + +This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/rfqns0wc) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f86489f7a280a3e03cdc012c40d12d1f59c248d --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json @@ -0,0 +1,287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.979346498185751e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1dfc79f7cdd73124f909152ced65a01eb82b33 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json @@ -0,0 +1,368 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1971161045794035e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02c25f2f0a17ce8a75e4dd95cf316c9e758e6736 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json @@ -0,0 +1,459 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4947622783933181e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33cc61dc8bb5cddf4d6195fc66edf795c2ce13e8 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json @@ -0,0 +1,540 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + }, + { + "entropy": 0.16372355209155517, + "epoch": 5.080321285140562, + "grad_norm": 8.029130935668945, + "learning_rate": 7.128190414543193e-05, + "loss": 0.6145073699951172, + "mean_token_accuracy": 0.9516371590922578, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14057113960385323, + "epoch": 5.214190093708166, + "grad_norm": 2.23626446723938, + "learning_rate": 6.840415347341672e-05, + "loss": 0.5295140075683594, + "mean_token_accuracy": 0.9593333688378334, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.14139273861423135, + "epoch": 5.34805890227577, + "grad_norm": 2.0157318115234375, + "learning_rate": 6.551278528230729e-05, + "loss": 0.5296827697753906, + "mean_token_accuracy": 0.9590813705325126, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.14537794288247824, + "epoch": 5.481927710843373, + "grad_norm": 1.5371013879776, + "learning_rate": 6.261409515739736e-05, + "loss": 0.5478645706176758, + "mean_token_accuracy": 0.9577724316716194, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.14534839443862438, + "epoch": 5.615796519410977, + "grad_norm": 2.0134589672088623, + "learning_rate": 5.971439462655727e-05, + "loss": 0.5426230239868164, + "mean_token_accuracy": 0.9581041479110718, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.14614912170916797, + "epoch": 5.749665327978581, + "grad_norm": 1.286437749862671, + "learning_rate": 5.6819997417687274e-05, + "loss": 0.5487421798706055, + "mean_token_accuracy": 0.9563529288768768, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.13987606402486563, + "epoch": 5.883534136546185, + "grad_norm": 1.7586702108383179, + "learning_rate": 5.393720571138079e-05, + "loss": 0.5254617309570313, + "mean_token_accuracy": 0.9590577334165573, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2240281231701374, + "eval_loss": 0.7485206723213196, + "eval_mean_token_accuracy": 0.8668996468186378, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.4089, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.074, + "step": 2244 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7914914724245857e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5433ad385e82359d7b5946b9979d051f55eeeb93 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + }, + { + "entropy": 0.16372355209155517, + "epoch": 5.080321285140562, + "grad_norm": 8.029130935668945, + "learning_rate": 7.128190414543193e-05, + "loss": 0.6145073699951172, + "mean_token_accuracy": 0.9516371590922578, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14057113960385323, + "epoch": 5.214190093708166, + "grad_norm": 2.23626446723938, + "learning_rate": 6.840415347341672e-05, + "loss": 0.5295140075683594, + "mean_token_accuracy": 0.9593333688378334, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.14139273861423135, + "epoch": 5.34805890227577, + "grad_norm": 2.0157318115234375, + "learning_rate": 6.551278528230729e-05, + "loss": 0.5296827697753906, + "mean_token_accuracy": 0.9590813705325126, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.14537794288247824, + "epoch": 5.481927710843373, + "grad_norm": 1.5371013879776, + "learning_rate": 6.261409515739736e-05, + "loss": 0.5478645706176758, + "mean_token_accuracy": 0.9577724316716194, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.14534839443862438, + "epoch": 5.615796519410977, + "grad_norm": 2.0134589672088623, + "learning_rate": 5.971439462655727e-05, + "loss": 0.5426230239868164, + "mean_token_accuracy": 0.9581041479110718, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.14614912170916797, + "epoch": 5.749665327978581, + "grad_norm": 1.286437749862671, + "learning_rate": 5.6819997417687274e-05, + "loss": 0.5487421798706055, + "mean_token_accuracy": 0.9563529288768768, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.13987606402486563, + "epoch": 5.883534136546185, + "grad_norm": 1.7586702108383179, + "learning_rate": 5.393720571138079e-05, + "loss": 0.5254617309570313, + "mean_token_accuracy": 0.9590577334165573, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2240281231701374, + "eval_loss": 0.7485206723213196, + "eval_mean_token_accuracy": 0.8668996468186378, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.4089, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.074, + "step": 2244 + }, + { + "entropy": 0.1413771447283451, + "epoch": 6.016064257028113, + "grad_norm": 1.2926467657089233, + "learning_rate": 5.1072296418730254e-05, + "loss": 0.5202234649658203, + "mean_token_accuracy": 0.9594009392189257, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.1042403375543654, + "epoch": 6.149933065595716, + "grad_norm": 1.9540276527404785, + "learning_rate": 4.8231507514154216e-05, + "loss": 0.39597846984863283, + "mean_token_accuracy": 0.9706364983320236, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.10351455600932241, + "epoch": 6.28380187416332, + "grad_norm": 2.139054775238037, + "learning_rate": 4.542102445300397e-05, + "loss": 0.38731266021728517, + "mean_token_accuracy": 0.9703371664881706, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11232182893902064, + "epoch": 6.417670682730924, + "grad_norm": 1.6526401042938232, + "learning_rate": 4.264696670352381e-05, + "loss": 0.42091716766357423, + "mean_token_accuracy": 0.9684987756609916, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.10796859875321388, + "epoch": 6.551539491298527, + "grad_norm": 1.297956109046936, + "learning_rate": 3.9915374422489785e-05, + "loss": 0.40640792846679685, + "mean_token_accuracy": 0.9703203043341637, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.10999857917428017, + "epoch": 6.685408299866131, + "grad_norm": 1.5105161666870117, + "learning_rate": 3.723219530353909e-05, + "loss": 0.4118352508544922, + "mean_token_accuracy": 0.9697986772656441, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11099046738818288, + "epoch": 6.8192771084337345, + "grad_norm": 1.8809560537338257, + "learning_rate": 3.460327162682602e-05, + "loss": 0.41624794006347654, + "mean_token_accuracy": 0.9690032437443733, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11062245365232229, + "epoch": 6.953145917001339, + "grad_norm": 1.0219827890396118, + "learning_rate": 3.2034327538202464e-05, + "loss": 0.41484325408935546, + "mean_token_accuracy": 0.9690453514456749, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.18908375523984433, + "eval_loss": 0.8491571545600891, + "eval_mean_token_accuracy": 0.8642131051421166, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.4633, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.073, + "step": 2618 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0923154774653926e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..18c30511083a57d4153d95c00daf86622ccbef21 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/trainer_state.json @@ -0,0 +1,712 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + }, + { + "entropy": 0.16372355209155517, + "epoch": 5.080321285140562, + "grad_norm": 8.029130935668945, + "learning_rate": 7.128190414543193e-05, + "loss": 0.6145073699951172, + "mean_token_accuracy": 0.9516371590922578, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14057113960385323, + "epoch": 5.214190093708166, + "grad_norm": 2.23626446723938, + "learning_rate": 6.840415347341672e-05, + "loss": 0.5295140075683594, + "mean_token_accuracy": 0.9593333688378334, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.14139273861423135, + "epoch": 5.34805890227577, + "grad_norm": 2.0157318115234375, + "learning_rate": 6.551278528230729e-05, + "loss": 0.5296827697753906, + "mean_token_accuracy": 0.9590813705325126, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.14537794288247824, + "epoch": 5.481927710843373, + "grad_norm": 1.5371013879776, + "learning_rate": 6.261409515739736e-05, + "loss": 0.5478645706176758, + "mean_token_accuracy": 0.9577724316716194, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.14534839443862438, + "epoch": 5.615796519410977, + "grad_norm": 2.0134589672088623, + "learning_rate": 5.971439462655727e-05, + "loss": 0.5426230239868164, + "mean_token_accuracy": 0.9581041479110718, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.14614912170916797, + "epoch": 5.749665327978581, + "grad_norm": 1.286437749862671, + "learning_rate": 5.6819997417687274e-05, + "loss": 0.5487421798706055, + "mean_token_accuracy": 0.9563529288768768, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.13987606402486563, + "epoch": 5.883534136546185, + "grad_norm": 1.7586702108383179, + "learning_rate": 5.393720571138079e-05, + "loss": 0.5254617309570313, + "mean_token_accuracy": 0.9590577334165573, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2240281231701374, + "eval_loss": 0.7485206723213196, + "eval_mean_token_accuracy": 0.8668996468186378, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.4089, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.074, + "step": 2244 + }, + { + "entropy": 0.1413771447283451, + "epoch": 6.016064257028113, + "grad_norm": 1.2926467657089233, + "learning_rate": 5.1072296418730254e-05, + "loss": 0.5202234649658203, + "mean_token_accuracy": 0.9594009392189257, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.1042403375543654, + "epoch": 6.149933065595716, + "grad_norm": 1.9540276527404785, + "learning_rate": 4.8231507514154216e-05, + "loss": 0.39597846984863283, + "mean_token_accuracy": 0.9706364983320236, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.10351455600932241, + "epoch": 6.28380187416332, + "grad_norm": 2.139054775238037, + "learning_rate": 4.542102445300397e-05, + "loss": 0.38731266021728517, + "mean_token_accuracy": 0.9703371664881706, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11232182893902064, + "epoch": 6.417670682730924, + "grad_norm": 1.6526401042938232, + "learning_rate": 4.264696670352381e-05, + "loss": 0.42091716766357423, + "mean_token_accuracy": 0.9684987756609916, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.10796859875321388, + "epoch": 6.551539491298527, + "grad_norm": 1.297956109046936, + "learning_rate": 3.9915374422489785e-05, + "loss": 0.40640792846679685, + "mean_token_accuracy": 0.9703203043341637, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.10999857917428017, + "epoch": 6.685408299866131, + "grad_norm": 1.5105161666870117, + "learning_rate": 3.723219530353909e-05, + "loss": 0.4118352508544922, + "mean_token_accuracy": 0.9697986772656441, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11099046738818288, + "epoch": 6.8192771084337345, + "grad_norm": 1.8809560537338257, + "learning_rate": 3.460327162682602e-05, + "loss": 0.41624794006347654, + "mean_token_accuracy": 0.9690032437443733, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11062245365232229, + "epoch": 6.953145917001339, + "grad_norm": 1.0219827890396118, + "learning_rate": 3.2034327538202464e-05, + "loss": 0.41484325408935546, + "mean_token_accuracy": 0.9690453514456749, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.18908375523984433, + "eval_loss": 0.8491571545600891, + "eval_mean_token_accuracy": 0.8642131051421166, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.4633, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.073, + "step": 2618 + }, + { + "entropy": 0.09948956533664405, + "epoch": 7.085676037483267, + "grad_norm": 1.4661338329315186, + "learning_rate": 2.9530956585620777e-05, + "loss": 0.36354263305664064, + "mean_token_accuracy": 0.9727776297415146, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.08666609892621636, + "epoch": 7.21954484605087, + "grad_norm": 1.9116477966308594, + "learning_rate": 2.7098609539896744e-05, + "loss": 0.3243706130981445, + "mean_token_accuracy": 0.9765083396434784, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09543853564187885, + "epoch": 7.353413654618474, + "grad_norm": 1.0068918466567993, + "learning_rate": 2.4742582526351715e-05, + "loss": 0.35761878967285154, + "mean_token_accuracy": 0.9740070801973343, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.08997446410357952, + "epoch": 7.4872824631860775, + "grad_norm": 1.6730849742889404, + "learning_rate": 2.246800549317553e-05, + "loss": 0.33653587341308594, + "mean_token_accuracy": 0.9758713039755821, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08550533290952445, + "epoch": 7.621151271753681, + "grad_norm": 1.3010321855545044, + "learning_rate": 2.027983104161894e-05, + "loss": 0.3204774856567383, + "mean_token_accuracy": 0.977160106599331, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09146139286458492, + "epoch": 7.755020080321285, + "grad_norm": 2.1133384704589844, + "learning_rate": 1.8182823642336212e-05, + "loss": 0.3351753234863281, + "mean_token_accuracy": 0.9754938682913781, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.08813748911023139, + "epoch": 7.888888888888889, + "grad_norm": 0.9765240550041199, + "learning_rate": 1.618154926135836e-05, + "loss": 0.3303861236572266, + "mean_token_accuracy": 0.9758572709560395, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.153879771232605, + "eval_loss": 1.0034006834030151, + "eval_mean_token_accuracy": 0.8645920944213867, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.4871, + "eval_samples_per_second": 16.572, + "eval_steps_per_second": 2.073, + "step": 2992 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.393061170429429e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2a6e683789c900d4b972f670b04c8658bb6e254b --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3366/trainer_state.json @@ -0,0 +1,803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + }, + { + "entropy": 0.16372355209155517, + "epoch": 5.080321285140562, + "grad_norm": 8.029130935668945, + "learning_rate": 7.128190414543193e-05, + "loss": 0.6145073699951172, + "mean_token_accuracy": 0.9516371590922578, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14057113960385323, + "epoch": 5.214190093708166, + "grad_norm": 2.23626446723938, + "learning_rate": 6.840415347341672e-05, + "loss": 0.5295140075683594, + "mean_token_accuracy": 0.9593333688378334, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.14139273861423135, + "epoch": 5.34805890227577, + "grad_norm": 2.0157318115234375, + "learning_rate": 6.551278528230729e-05, + "loss": 0.5296827697753906, + "mean_token_accuracy": 0.9590813705325126, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.14537794288247824, + "epoch": 5.481927710843373, + "grad_norm": 1.5371013879776, + "learning_rate": 6.261409515739736e-05, + "loss": 0.5478645706176758, + "mean_token_accuracy": 0.9577724316716194, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.14534839443862438, + "epoch": 5.615796519410977, + "grad_norm": 2.0134589672088623, + "learning_rate": 5.971439462655727e-05, + "loss": 0.5426230239868164, + "mean_token_accuracy": 0.9581041479110718, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.14614912170916797, + "epoch": 5.749665327978581, + "grad_norm": 1.286437749862671, + "learning_rate": 5.6819997417687274e-05, + "loss": 0.5487421798706055, + "mean_token_accuracy": 0.9563529288768768, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.13987606402486563, + "epoch": 5.883534136546185, + "grad_norm": 1.7586702108383179, + "learning_rate": 5.393720571138079e-05, + "loss": 0.5254617309570313, + "mean_token_accuracy": 0.9590577334165573, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2240281231701374, + "eval_loss": 0.7485206723213196, + "eval_mean_token_accuracy": 0.8668996468186378, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.4089, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.074, + "step": 2244 + }, + { + "entropy": 0.1413771447283451, + "epoch": 6.016064257028113, + "grad_norm": 1.2926467657089233, + "learning_rate": 5.1072296418730254e-05, + "loss": 0.5202234649658203, + "mean_token_accuracy": 0.9594009392189257, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.1042403375543654, + "epoch": 6.149933065595716, + "grad_norm": 1.9540276527404785, + "learning_rate": 4.8231507514154216e-05, + "loss": 0.39597846984863283, + "mean_token_accuracy": 0.9706364983320236, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.10351455600932241, + "epoch": 6.28380187416332, + "grad_norm": 2.139054775238037, + "learning_rate": 4.542102445300397e-05, + "loss": 0.38731266021728517, + "mean_token_accuracy": 0.9703371664881706, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11232182893902064, + "epoch": 6.417670682730924, + "grad_norm": 1.6526401042938232, + "learning_rate": 4.264696670352381e-05, + "loss": 0.42091716766357423, + "mean_token_accuracy": 0.9684987756609916, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.10796859875321388, + "epoch": 6.551539491298527, + "grad_norm": 1.297956109046936, + "learning_rate": 3.9915374422489785e-05, + "loss": 0.40640792846679685, + "mean_token_accuracy": 0.9703203043341637, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.10999857917428017, + "epoch": 6.685408299866131, + "grad_norm": 1.5105161666870117, + "learning_rate": 3.723219530353909e-05, + "loss": 0.4118352508544922, + "mean_token_accuracy": 0.9697986772656441, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11099046738818288, + "epoch": 6.8192771084337345, + "grad_norm": 1.8809560537338257, + "learning_rate": 3.460327162682602e-05, + "loss": 0.41624794006347654, + "mean_token_accuracy": 0.9690032437443733, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11062245365232229, + "epoch": 6.953145917001339, + "grad_norm": 1.0219827890396118, + "learning_rate": 3.2034327538202464e-05, + "loss": 0.41484325408935546, + "mean_token_accuracy": 0.9690453514456749, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.18908375523984433, + "eval_loss": 0.8491571545600891, + "eval_mean_token_accuracy": 0.8642131051421166, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.4633, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.073, + "step": 2618 + }, + { + "entropy": 0.09948956533664405, + "epoch": 7.085676037483267, + "grad_norm": 1.4661338329315186, + "learning_rate": 2.9530956585620777e-05, + "loss": 0.36354263305664064, + "mean_token_accuracy": 0.9727776297415146, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.08666609892621636, + "epoch": 7.21954484605087, + "grad_norm": 1.9116477966308594, + "learning_rate": 2.7098609539896744e-05, + "loss": 0.3243706130981445, + "mean_token_accuracy": 0.9765083396434784, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09543853564187885, + "epoch": 7.353413654618474, + "grad_norm": 1.0068918466567993, + "learning_rate": 2.4742582526351715e-05, + "loss": 0.35761878967285154, + "mean_token_accuracy": 0.9740070801973343, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.08997446410357952, + "epoch": 7.4872824631860775, + "grad_norm": 1.6730849742889404, + "learning_rate": 2.246800549317553e-05, + "loss": 0.33653587341308594, + "mean_token_accuracy": 0.9758713039755821, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08550533290952445, + "epoch": 7.621151271753681, + "grad_norm": 1.3010321855545044, + "learning_rate": 2.027983104161894e-05, + "loss": 0.3204774856567383, + "mean_token_accuracy": 0.977160106599331, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09146139286458492, + "epoch": 7.755020080321285, + "grad_norm": 2.1133384704589844, + "learning_rate": 1.8182823642336212e-05, + "loss": 0.3351753234863281, + "mean_token_accuracy": 0.9754938682913781, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.08813748911023139, + "epoch": 7.888888888888889, + "grad_norm": 0.9765240550041199, + "learning_rate": 1.618154926135836e-05, + "loss": 0.3303861236572266, + "mean_token_accuracy": 0.9758572709560395, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.153879771232605, + "eval_loss": 1.0034006834030151, + "eval_mean_token_accuracy": 0.8645920944213867, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.4871, + "eval_samples_per_second": 16.572, + "eval_steps_per_second": 2.073, + "step": 2992 + }, + { + "entropy": 0.09048398275568027, + "epoch": 8.021419009370817, + "grad_norm": 0.4018457531929016, + "learning_rate": 1.4280365418284746e-05, + "loss": 0.3326351547241211, + "mean_token_accuracy": 0.9755137812609624, + "num_tokens": 6997584.0, + "step": 3000 + }, + { + "entropy": 0.08544229088351131, + "epoch": 8.15528781793842, + "grad_norm": 0.552768886089325, + "learning_rate": 1.2483411698340072e-05, + "loss": 0.3177168655395508, + "mean_token_accuracy": 0.977306153178215, + "num_tokens": 7109661.0, + "step": 3050 + }, + { + "entropy": 0.08211908274330199, + "epoch": 8.289156626506024, + "grad_norm": 0.7745324373245239, + "learning_rate": 1.0794600738955833e-05, + "loss": 0.305778751373291, + "mean_token_accuracy": 0.9774795493483543, + "num_tokens": 7228951.0, + "step": 3100 + }, + { + "entropy": 0.07924632488749922, + "epoch": 8.423025435073628, + "grad_norm": 0.6892443299293518, + "learning_rate": 9.217609710501601e-06, + "loss": 0.29681636810302736, + "mean_token_accuracy": 0.9784620434045792, + "num_tokens": 7345974.0, + "step": 3150 + }, + { + "entropy": 0.07959031270816923, + "epoch": 8.556894243641231, + "grad_norm": 1.8224815130233765, + "learning_rate": 7.755872309715688e-06, + "loss": 0.2975615882873535, + "mean_token_accuracy": 0.9780591726303101, + "num_tokens": 7465280.0, + "step": 3200 + }, + { + "entropy": 0.07911164808087051, + "epoch": 8.690763052208835, + "grad_norm": 0.7088468074798584, + "learning_rate": 6.4125712832686665e-06, + "loss": 0.2949537658691406, + "mean_token_accuracy": 0.9787314286828042, + "num_tokens": 7584144.0, + "step": 3250 + }, + { + "entropy": 0.08394345591776073, + "epoch": 8.824631860776439, + "grad_norm": 0.7498103976249695, + "learning_rate": 5.19063149773867e-06, + "loss": 0.3096595764160156, + "mean_token_accuracy": 0.9774689373373985, + "num_tokens": 7698337.0, + "step": 3300 + }, + { + "entropy": 0.08363399875350297, + "epoch": 8.958500669344042, + "grad_norm": 0.5460980534553528, + "learning_rate": 4.092713571087534e-06, + "loss": 0.31484752655029297, + "mean_token_accuracy": 0.9767562291026115, + "num_tokens": 7815006.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.14292236048728227, + "eval_loss": 1.0725034475326538, + "eval_mean_token_accuracy": 0.8639731431007385, + "eval_num_tokens": 7850223.0, + "eval_runtime": 96.5743, + "eval_samples_per_second": 16.557, + "eval_steps_per_second": 2.071, + "step": 3366 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6911646503474606e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..85594b7a33d4f63b0b612916e22727c8151d4bdd --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-374/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.9712111865733606e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6364c521cb2ab2c74496ee68a0b05029744ec8b7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-3740/trainer_state.json @@ -0,0 +1,884 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + }, + { + "entropy": 0.4378527848407476, + "epoch": 2.005354752342704, + "grad_norm": 1.400229573249817, + "learning_rate": 0.0001205321224461161, + "loss": 1.7096096801757812, + "mean_token_accuracy": 0.8838462468349573, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.3559799794852734, + "epoch": 2.139223560910308, + "grad_norm": 1.7168083190917969, + "learning_rate": 0.0001194744094815093, + "loss": 1.3893603515625, + "mean_token_accuracy": 0.9004731178283691, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3671448823064566, + "epoch": 2.2730923694779115, + "grad_norm": 1.9720135927200317, + "learning_rate": 0.00011829188003198282, + "loss": 1.429988555908203, + "mean_token_accuracy": 0.8970818132162094, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3597494306415319, + "epoch": 2.4069611780455156, + "grad_norm": 1.4947372674942017, + "learning_rate": 0.00011698710890452068, + "loss": 1.418173828125, + "mean_token_accuracy": 0.8994651186466217, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.36254502907395364, + "epoch": 2.540829986613119, + "grad_norm": 1.6768454313278198, + "learning_rate": 0.00011556293707176242, + "loss": 1.4158590698242188, + "mean_token_accuracy": 0.8995477721095085, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.36290778368711474, + "epoch": 2.674698795180723, + "grad_norm": 1.6033697128295898, + "learning_rate": 0.00011402246548614765, + "loss": 1.4300469970703125, + "mean_token_accuracy": 0.8986452376842499, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.3635872249305248, + "epoch": 2.8085676037483265, + "grad_norm": 1.546893835067749, + "learning_rate": 0.00011236904832798785, + "loss": 1.42587646484375, + "mean_token_accuracy": 0.9003903394937516, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.36871150620281695, + "epoch": 2.9424364123159306, + "grad_norm": 1.2951405048370361, + "learning_rate": 0.0001106062857021667, + "loss": 1.448046875, + "mean_token_accuracy": 0.8967258337140084, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.4225208269059658, + "eval_loss": 0.489418089389801, + "eval_mean_token_accuracy": 0.8697815361618996, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.4058, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.075, + "step": 1122 + }, + { + "entropy": 0.3120347365285411, + "epoch": 3.074966532797858, + "grad_norm": 1.639520287513733, + "learning_rate": 0.00010873801579937106, + "loss": 1.1941973876953125, + "mean_token_accuracy": 0.9117801315856703, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.28257040068507194, + "epoch": 3.208835341365462, + "grad_norm": 1.7459681034088135, + "learning_rate": 0.00010676830653892058, + "loss": 1.0850601196289062, + "mean_token_accuracy": 0.9177472350001336, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.27802520349621773, + "epoch": 3.3427041499330654, + "grad_norm": 1.5176103115081787, + "learning_rate": 0.00010470144671139238, + "loss": 1.0840838623046876, + "mean_token_accuracy": 0.9179763168096542, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.280417420566082, + "epoch": 3.4765729585006695, + "grad_norm": 1.3774974346160889, + "learning_rate": 0.00010254193664032686, + "loss": 1.0911756896972655, + "mean_token_accuracy": 0.9162956389784813, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.2834589210152626, + "epoch": 3.610441767068273, + "grad_norm": 1.5929396152496338, + "learning_rate": 0.00010029447838334742, + "loss": 1.0985262298583984, + "mean_token_accuracy": 0.9174074530601501, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.282296127229929, + "epoch": 3.7443105756358768, + "grad_norm": 1.50350022315979, + "learning_rate": 9.796396549403e-05, + "loss": 1.101386260986328, + "mean_token_accuracy": 0.9168545073270797, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.279728781580925, + "epoch": 3.878179384203481, + "grad_norm": 1.4728187322616577, + "learning_rate": 9.555547236681456e-05, + "loss": 1.0859880065917968, + "mean_token_accuracy": 0.9178367125988006, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34304031178355215, + "eval_loss": 0.5295785665512085, + "eval_mean_token_accuracy": 0.8698753178119659, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.3616, + "eval_samples_per_second": 16.594, + "eval_steps_per_second": 2.076, + "step": 1496 + }, + { + "entropy": 0.27893446536377225, + "epoch": 4.010709504685408, + "grad_norm": 1.545491337776184, + "learning_rate": 9.30742431881587e-05, + "loss": 1.0577442169189453, + "mean_token_accuracy": 0.9191552999645772, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.19769302535802125, + "epoch": 4.144578313253012, + "grad_norm": 2.10296893119812, + "learning_rate": 9.052568051799083e-05, + "loss": 0.7461458587646485, + "mean_token_accuracy": 0.9415343621373177, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.1981763695180416, + "epoch": 4.278447121820616, + "grad_norm": 2.067410945892334, + "learning_rate": 8.791533352632524e-05, + "loss": 0.7580889892578125, + "mean_token_accuracy": 0.9396374526619912, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.19850988369435071, + "epoch": 4.412315930388219, + "grad_norm": 1.9034850597381592, + "learning_rate": 8.524888591065258e-05, + "loss": 0.7526986694335938, + "mean_token_accuracy": 0.9402479353547096, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.19905407220125199, + "epoch": 4.546184738955823, + "grad_norm": 2.1477949619293213, + "learning_rate": 8.253214352041379e-05, + "loss": 0.7603612518310547, + "mean_token_accuracy": 0.9396576225757599, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.20251497332006693, + "epoch": 4.680053547523427, + "grad_norm": 1.5489246845245361, + "learning_rate": 7.97710217155036e-05, + "loss": 0.7711930084228515, + "mean_token_accuracy": 0.9400961664319039, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.1991352306306362, + "epoch": 4.813922356091031, + "grad_norm": 1.969994068145752, + "learning_rate": 7.697153248632946e-05, + "loss": 0.7681967163085938, + "mean_token_accuracy": 0.9399621617794037, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.20229352474212647, + "epoch": 4.947791164658635, + "grad_norm": 2.2329719066619873, + "learning_rate": 7.41397713634694e-05, + "loss": 0.7733911895751953, + "mean_token_accuracy": 0.9396535342931748, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.270584502145648, + "eval_loss": 0.6255385875701904, + "eval_mean_token_accuracy": 0.8687835082411766, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.6331, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.07, + "step": 1870 + }, + { + "entropy": 0.16372355209155517, + "epoch": 5.080321285140562, + "grad_norm": 8.029130935668945, + "learning_rate": 7.128190414543193e-05, + "loss": 0.6145073699951172, + "mean_token_accuracy": 0.9516371590922578, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14057113960385323, + "epoch": 5.214190093708166, + "grad_norm": 2.23626446723938, + "learning_rate": 6.840415347341672e-05, + "loss": 0.5295140075683594, + "mean_token_accuracy": 0.9593333688378334, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.14139273861423135, + "epoch": 5.34805890227577, + "grad_norm": 2.0157318115234375, + "learning_rate": 6.551278528230729e-05, + "loss": 0.5296827697753906, + "mean_token_accuracy": 0.9590813705325126, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.14537794288247824, + "epoch": 5.481927710843373, + "grad_norm": 1.5371013879776, + "learning_rate": 6.261409515739736e-05, + "loss": 0.5478645706176758, + "mean_token_accuracy": 0.9577724316716194, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.14534839443862438, + "epoch": 5.615796519410977, + "grad_norm": 2.0134589672088623, + "learning_rate": 5.971439462655727e-05, + "loss": 0.5426230239868164, + "mean_token_accuracy": 0.9581041479110718, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.14614912170916797, + "epoch": 5.749665327978581, + "grad_norm": 1.286437749862671, + "learning_rate": 5.6819997417687274e-05, + "loss": 0.5487421798706055, + "mean_token_accuracy": 0.9563529288768768, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.13987606402486563, + "epoch": 5.883534136546185, + "grad_norm": 1.7586702108383179, + "learning_rate": 5.393720571138079e-05, + "loss": 0.5254617309570313, + "mean_token_accuracy": 0.9590577334165573, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2240281231701374, + "eval_loss": 0.7485206723213196, + "eval_mean_token_accuracy": 0.8668996468186378, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.4089, + "eval_samples_per_second": 16.586, + "eval_steps_per_second": 2.074, + "step": 2244 + }, + { + "entropy": 0.1413771447283451, + "epoch": 6.016064257028113, + "grad_norm": 1.2926467657089233, + "learning_rate": 5.1072296418730254e-05, + "loss": 0.5202234649658203, + "mean_token_accuracy": 0.9594009392189257, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.1042403375543654, + "epoch": 6.149933065595716, + "grad_norm": 1.9540276527404785, + "learning_rate": 4.8231507514154216e-05, + "loss": 0.39597846984863283, + "mean_token_accuracy": 0.9706364983320236, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.10351455600932241, + "epoch": 6.28380187416332, + "grad_norm": 2.139054775238037, + "learning_rate": 4.542102445300397e-05, + "loss": 0.38731266021728517, + "mean_token_accuracy": 0.9703371664881706, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11232182893902064, + "epoch": 6.417670682730924, + "grad_norm": 1.6526401042938232, + "learning_rate": 4.264696670352381e-05, + "loss": 0.42091716766357423, + "mean_token_accuracy": 0.9684987756609916, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.10796859875321388, + "epoch": 6.551539491298527, + "grad_norm": 1.297956109046936, + "learning_rate": 3.9915374422489785e-05, + "loss": 0.40640792846679685, + "mean_token_accuracy": 0.9703203043341637, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.10999857917428017, + "epoch": 6.685408299866131, + "grad_norm": 1.5105161666870117, + "learning_rate": 3.723219530353909e-05, + "loss": 0.4118352508544922, + "mean_token_accuracy": 0.9697986772656441, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11099046738818288, + "epoch": 6.8192771084337345, + "grad_norm": 1.8809560537338257, + "learning_rate": 3.460327162682602e-05, + "loss": 0.41624794006347654, + "mean_token_accuracy": 0.9690032437443733, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11062245365232229, + "epoch": 6.953145917001339, + "grad_norm": 1.0219827890396118, + "learning_rate": 3.2034327538202464e-05, + "loss": 0.41484325408935546, + "mean_token_accuracy": 0.9690453514456749, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.18908375523984433, + "eval_loss": 0.8491571545600891, + "eval_mean_token_accuracy": 0.8642131051421166, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.4633, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.073, + "step": 2618 + }, + { + "entropy": 0.09948956533664405, + "epoch": 7.085676037483267, + "grad_norm": 1.4661338329315186, + "learning_rate": 2.9530956585620777e-05, + "loss": 0.36354263305664064, + "mean_token_accuracy": 0.9727776297415146, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.08666609892621636, + "epoch": 7.21954484605087, + "grad_norm": 1.9116477966308594, + "learning_rate": 2.7098609539896744e-05, + "loss": 0.3243706130981445, + "mean_token_accuracy": 0.9765083396434784, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09543853564187885, + "epoch": 7.353413654618474, + "grad_norm": 1.0068918466567993, + "learning_rate": 2.4742582526351715e-05, + "loss": 0.35761878967285154, + "mean_token_accuracy": 0.9740070801973343, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.08997446410357952, + "epoch": 7.4872824631860775, + "grad_norm": 1.6730849742889404, + "learning_rate": 2.246800549317553e-05, + "loss": 0.33653587341308594, + "mean_token_accuracy": 0.9758713039755821, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08550533290952445, + "epoch": 7.621151271753681, + "grad_norm": 1.3010321855545044, + "learning_rate": 2.027983104161894e-05, + "loss": 0.3204774856567383, + "mean_token_accuracy": 0.977160106599331, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09146139286458492, + "epoch": 7.755020080321285, + "grad_norm": 2.1133384704589844, + "learning_rate": 1.8182823642336212e-05, + "loss": 0.3351753234863281, + "mean_token_accuracy": 0.9754938682913781, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.08813748911023139, + "epoch": 7.888888888888889, + "grad_norm": 0.9765240550041199, + "learning_rate": 1.618154926135836e-05, + "loss": 0.3303861236572266, + "mean_token_accuracy": 0.9758572709560395, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.153879771232605, + "eval_loss": 1.0034006834030151, + "eval_mean_token_accuracy": 0.8645920944213867, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.4871, + "eval_samples_per_second": 16.572, + "eval_steps_per_second": 2.073, + "step": 2992 + }, + { + "entropy": 0.09048398275568027, + "epoch": 8.021419009370817, + "grad_norm": 0.4018457531929016, + "learning_rate": 1.4280365418284746e-05, + "loss": 0.3326351547241211, + "mean_token_accuracy": 0.9755137812609624, + "num_tokens": 6997584.0, + "step": 3000 + }, + { + "entropy": 0.08544229088351131, + "epoch": 8.15528781793842, + "grad_norm": 0.552768886089325, + "learning_rate": 1.2483411698340072e-05, + "loss": 0.3177168655395508, + "mean_token_accuracy": 0.977306153178215, + "num_tokens": 7109661.0, + "step": 3050 + }, + { + "entropy": 0.08211908274330199, + "epoch": 8.289156626506024, + "grad_norm": 0.7745324373245239, + "learning_rate": 1.0794600738955833e-05, + "loss": 0.305778751373291, + "mean_token_accuracy": 0.9774795493483543, + "num_tokens": 7228951.0, + "step": 3100 + }, + { + "entropy": 0.07924632488749922, + "epoch": 8.423025435073628, + "grad_norm": 0.6892443299293518, + "learning_rate": 9.217609710501601e-06, + "loss": 0.29681636810302736, + "mean_token_accuracy": 0.9784620434045792, + "num_tokens": 7345974.0, + "step": 3150 + }, + { + "entropy": 0.07959031270816923, + "epoch": 8.556894243641231, + "grad_norm": 1.8224815130233765, + "learning_rate": 7.755872309715688e-06, + "loss": 0.2975615882873535, + "mean_token_accuracy": 0.9780591726303101, + "num_tokens": 7465280.0, + "step": 3200 + }, + { + "entropy": 0.07911164808087051, + "epoch": 8.690763052208835, + "grad_norm": 0.7088468074798584, + "learning_rate": 6.4125712832686665e-06, + "loss": 0.2949537658691406, + "mean_token_accuracy": 0.9787314286828042, + "num_tokens": 7584144.0, + "step": 3250 + }, + { + "entropy": 0.08394345591776073, + "epoch": 8.824631860776439, + "grad_norm": 0.7498103976249695, + "learning_rate": 5.19063149773867e-06, + "loss": 0.3096595764160156, + "mean_token_accuracy": 0.9774689373373985, + "num_tokens": 7698337.0, + "step": 3300 + }, + { + "entropy": 0.08363399875350297, + "epoch": 8.958500669344042, + "grad_norm": 0.5460980534553528, + "learning_rate": 4.092713571087534e-06, + "loss": 0.31484752655029297, + "mean_token_accuracy": 0.9767562291026115, + "num_tokens": 7815006.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.14292236048728227, + "eval_loss": 1.0725034475326538, + "eval_mean_token_accuracy": 0.8639731431007385, + "eval_num_tokens": 7850223.0, + "eval_runtime": 96.5743, + "eval_samples_per_second": 16.557, + "eval_steps_per_second": 2.071, + "step": 3366 + }, + { + "entropy": 0.08699184825474565, + "epoch": 9.09103078982597, + "grad_norm": 0.4920552670955658, + "learning_rate": 3.1212080795047673e-06, + "loss": 0.3132004165649414, + "mean_token_accuracy": 0.9772917390471757, + "num_tokens": 7924040.0, + "step": 3400 + }, + { + "entropy": 0.07937462277710437, + "epoch": 9.224899598393574, + "grad_norm": 0.5245408415794373, + "learning_rate": 2.278230352232899e-06, + "loss": 0.2948256874084473, + "mean_token_accuracy": 0.9782775729894638, + "num_tokens": 8035493.0, + "step": 3450 + }, + { + "entropy": 0.08003638771362603, + "epoch": 9.358768406961179, + "grad_norm": 0.5275429487228394, + "learning_rate": 1.5656158657080147e-06, + "loss": 0.2957208824157715, + "mean_token_accuracy": 0.9780283415317536, + "num_tokens": 8150965.0, + "step": 3500 + }, + { + "entropy": 0.07463583857752383, + "epoch": 9.492637215528783, + "grad_norm": 0.41151365637779236, + "learning_rate": 9.849162470439522e-07, + "loss": 0.2795832633972168, + "mean_token_accuracy": 0.9797070980072021, + "num_tokens": 8273746.0, + "step": 3550 + }, + { + "entropy": 0.07939885665662587, + "epoch": 9.626506024096386, + "grad_norm": 0.6897122859954834, + "learning_rate": 5.373958955623113e-07, + "loss": 0.29586851119995117, + "mean_token_accuracy": 0.9783321896195412, + "num_tokens": 8391652.0, + "step": 3600 + }, + { + "entropy": 0.0767002559825778, + "epoch": 9.76037483266399, + "grad_norm": 0.9138604998588562, + "learning_rate": 2.240292297242689e-07, + "loss": 0.28384410858154296, + "mean_token_accuracy": 0.9795003500580788, + "num_tokens": 8509546.0, + "step": 3650 + }, + { + "entropy": 0.07247084728442132, + "epoch": 9.894243641231594, + "grad_norm": 0.5386644601821899, + "learning_rate": 4.549856545868282e-08, + "loss": 0.2697745323181152, + "mean_token_accuracy": 0.9802604866027832, + "num_tokens": 8634626.0, + "step": 3700 + }, + { + "epoch": 10.0, + "eval_entropy": 0.1328240069001913, + "eval_loss": 1.144450306892395, + "eval_mean_token_accuracy": 0.8635567063093186, + "eval_num_tokens": 8722470.0, + "eval_runtime": 96.7758, + "eval_samples_per_second": 16.523, + "eval_steps_per_second": 2.067, + "step": 3740 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.9909736944491034e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2148faba04b3eea9d8bc79cdd2f52c92b8cda9e7 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.015034304668777832, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4413e450433ac499cb4144e230d4444e246e487 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-748/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3355020767450332, + "epoch": 0.13386880856760375, + "grad_norm": 3.2956597805023193, + "learning_rate": 1.628530639938585e-05, + "loss": 5.349910278320312, + "mean_token_accuracy": 0.7383818039298058, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5958842460811138, + "epoch": 0.2677376171352075, + "grad_norm": 2.5947492122650146, + "learning_rate": 3.290296599059591e-05, + "loss": 2.312855072021484, + "mean_token_accuracy": 0.8520967712998391, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5190362003445625, + "epoch": 0.40160642570281124, + "grad_norm": 1.5038394927978516, + "learning_rate": 4.9520625581805955e-05, + "loss": 2.0574468994140624, + "mean_token_accuracy": 0.8657039344310761, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4922871346771717, + "epoch": 0.535475234270415, + "grad_norm": 1.645923137664795, + "learning_rate": 6.613828517301602e-05, + "loss": 1.916438446044922, + "mean_token_accuracy": 0.8717759534716606, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.491110111027956, + "epoch": 0.6693440428380187, + "grad_norm": 1.866817593574524, + "learning_rate": 8.275594476422607e-05, + "loss": 1.9421713256835937, + "mean_token_accuracy": 0.8710730043053627, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.47134352535009383, + "epoch": 0.8032128514056225, + "grad_norm": 117.62409210205078, + "learning_rate": 9.937360435543611e-05, + "loss": 1.9768324279785157, + "mean_token_accuracy": 0.8741078078746796, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.4820582258701325, + "epoch": 0.9370816599732262, + "grad_norm": 2.3274827003479004, + "learning_rate": 0.00011599126394664616, + "loss": 2.2025875854492187, + "mean_token_accuracy": 0.8697148504853248, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5010400542616844, + "eval_loss": 0.5114277601242065, + "eval_mean_token_accuracy": 0.8587275749444961, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.5515, + "eval_samples_per_second": 16.561, + "eval_steps_per_second": 2.071, + "step": 374 + }, + { + "entropy": 0.4708875769918615, + "epoch": 1.069611780455154, + "grad_norm": 3.3712940216064453, + "learning_rate": 0.00012428317596508976, + "loss": 1.83294189453125, + "mean_token_accuracy": 0.8772370366737096, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.44804590195417404, + "epoch": 1.2034805890227578, + "grad_norm": 1.4833389520645142, + "learning_rate": 0.00012414788900475706, + "loss": 1.7768891906738282, + "mean_token_accuracy": 0.8791097947955131, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.4510513086616993, + "epoch": 1.3373493975903614, + "grad_norm": 2.814790964126587, + "learning_rate": 0.00012387760965418496, + "loss": 1.7745071411132813, + "mean_token_accuracy": 0.8813075706362724, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.4479117552936077, + "epoch": 1.4712182061579653, + "grad_norm": 1.855610728263855, + "learning_rate": 0.00012347292641217135, + "loss": 1.7583291625976563, + "mean_token_accuracy": 0.8815277495980263, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4380264139175415, + "epoch": 1.605087014725569, + "grad_norm": 1.383190631866455, + "learning_rate": 0.00012293472042483757, + "loss": 1.7229583740234375, + "mean_token_accuracy": 0.8832098203897476, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.4342571949958801, + "epoch": 1.7389558232931726, + "grad_norm": 1.4977834224700928, + "learning_rate": 0.00012226416356704526, + "loss": 1.7174737548828125, + "mean_token_accuracy": 0.8834967383742333, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.42700962007045745, + "epoch": 1.8728246318607764, + "grad_norm": 1.6156537532806396, + "learning_rate": 0.00012146271589078838, + "loss": 1.682061767578125, + "mean_token_accuracy": 0.8858474844694137, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4838937771320343, + "eval_loss": 0.4826815128326416, + "eval_mean_token_accuracy": 0.8682844692468643, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5071, + "eval_samples_per_second": 16.569, + "eval_steps_per_second": 2.072, + "step": 748 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.968365525090723e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c55b738fe5c4e833530a57ccb4e434a28983796e --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md @@ -0,0 +1,58 @@ +--- +base_model: google/gemma-4-31B +library_name: transformers +model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test2 +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test2 + +This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/xuz5975p) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.0 +- Transformers: 5.5.4 +- Pytorch: 2.10.0 +- Datasets: 4.6.1 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..959d2f8a2c527013315d4760e1f067117eb36984 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1122/trainer_state.json @@ -0,0 +1,287 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.906910982888778e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d5a55f4fb938957c63e018aa3264c96b476a6578 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1496/trainer_state.json @@ -0,0 +1,368 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 1496, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1874590853383002e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..107f041ea655da0ed5c4590dd66b6c2a00c1e982 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1870/trainer_state.json @@ -0,0 +1,459 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.482704176402959e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb431803690260ca3f6cba8114749bb3bb5a5531 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2244/trainer_state.json @@ -0,0 +1,540 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 2244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + }, + { + "entropy": 0.1729431924871122, + "epoch": 5.080321285140562, + "grad_norm": 1.9089794158935547, + "learning_rate": 9.395941842759104e-05, + "loss": 0.6498579406738281, + "mean_token_accuracy": 0.948695200561273, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14844272032380104, + "epoch": 5.214190093708166, + "grad_norm": 2.9947986602783203, + "learning_rate": 9.016614462600325e-05, + "loss": 0.5658287048339844, + "mean_token_accuracy": 0.9562490177154541, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.15113764170557262, + "epoch": 5.34805890227577, + "grad_norm": 3.0459158420562744, + "learning_rate": 8.635492104894498e-05, + "loss": 0.569720458984375, + "mean_token_accuracy": 0.9561498582363128, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.15329553466290236, + "epoch": 5.481927710843373, + "grad_norm": 2.5919315814971924, + "learning_rate": 8.253404614943809e-05, + "loss": 0.5799734878540039, + "mean_token_accuracy": 0.954962648153305, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.1544624574482441, + "epoch": 5.615796519410977, + "grad_norm": 3.170863628387451, + "learning_rate": 7.871183939502759e-05, + "loss": 0.5769558715820312, + "mean_token_accuracy": 0.9549962303042412, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.1543046496436, + "epoch": 5.749665327978581, + "grad_norm": 1.843237280845642, + "learning_rate": 7.489662315320254e-05, + "loss": 0.5841741561889648, + "mean_token_accuracy": 0.9532951918244362, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.14787622597068548, + "epoch": 5.883534136546185, + "grad_norm": 2.890704393386841, + "learning_rate": 7.109670457050292e-05, + "loss": 0.5526316452026367, + "mean_token_accuracy": 0.9569103759527207, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2396429342031479, + "eval_loss": 0.7484959959983826, + "eval_mean_token_accuracy": 0.8640641874074936, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.6984, + "eval_samples_per_second": 16.526, + "eval_steps_per_second": 2.068, + "step": 2244 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7770396848717358e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..955d6570bde68e1a78726644c57f06a4aab2c7dd --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2618/trainer_state.json @@ -0,0 +1,631 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.0, + "eval_steps": 500, + "global_step": 2618, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + }, + { + "entropy": 0.1729431924871122, + "epoch": 5.080321285140562, + "grad_norm": 1.9089794158935547, + "learning_rate": 9.395941842759104e-05, + "loss": 0.6498579406738281, + "mean_token_accuracy": 0.948695200561273, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14844272032380104, + "epoch": 5.214190093708166, + "grad_norm": 2.9947986602783203, + "learning_rate": 9.016614462600325e-05, + "loss": 0.5658287048339844, + "mean_token_accuracy": 0.9562490177154541, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.15113764170557262, + "epoch": 5.34805890227577, + "grad_norm": 3.0459158420562744, + "learning_rate": 8.635492104894498e-05, + "loss": 0.569720458984375, + "mean_token_accuracy": 0.9561498582363128, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.15329553466290236, + "epoch": 5.481927710843373, + "grad_norm": 2.5919315814971924, + "learning_rate": 8.253404614943809e-05, + "loss": 0.5799734878540039, + "mean_token_accuracy": 0.954962648153305, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.1544624574482441, + "epoch": 5.615796519410977, + "grad_norm": 3.170863628387451, + "learning_rate": 7.871183939502759e-05, + "loss": 0.5769558715820312, + "mean_token_accuracy": 0.9549962303042412, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.1543046496436, + "epoch": 5.749665327978581, + "grad_norm": 1.843237280845642, + "learning_rate": 7.489662315320254e-05, + "loss": 0.5841741561889648, + "mean_token_accuracy": 0.9532951918244362, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.14787622597068548, + "epoch": 5.883534136546185, + "grad_norm": 2.890704393386841, + "learning_rate": 7.109670457050292e-05, + "loss": 0.5526316452026367, + "mean_token_accuracy": 0.9569103759527207, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2396429342031479, + "eval_loss": 0.7484959959983826, + "eval_mean_token_accuracy": 0.8640641874074936, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.6984, + "eval_samples_per_second": 16.526, + "eval_steps_per_second": 2.068, + "step": 2244 + }, + { + "entropy": 0.14854476036447467, + "epoch": 6.016064257028113, + "grad_norm": 2.5248372554779053, + "learning_rate": 6.732035748476789e-05, + "loss": 0.5454582977294922, + "mean_token_accuracy": 0.9571841708337417, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.10946711044758559, + "epoch": 6.149933065595716, + "grad_norm": 2.492736577987671, + "learning_rate": 6.357580440990978e-05, + "loss": 0.4096903991699219, + "mean_token_accuracy": 0.96969064027071, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.1087673882767558, + "epoch": 6.28380187416332, + "grad_norm": 2.550652027130127, + "learning_rate": 5.9871198632439174e-05, + "loss": 0.4065860748291016, + "mean_token_accuracy": 0.969154157936573, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11702938644215465, + "epoch": 6.417670682730924, + "grad_norm": 2.2913568019866943, + "learning_rate": 5.621460645872391e-05, + "loss": 0.4343274688720703, + "mean_token_accuracy": 0.9671943977475166, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.11134784514084459, + "epoch": 6.551539491298527, + "grad_norm": 2.7473230361938477, + "learning_rate": 5.2613989651636254e-05, + "loss": 0.4231544876098633, + "mean_token_accuracy": 0.968756687939167, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.11262517396360636, + "epoch": 6.685408299866131, + "grad_norm": 1.8821665048599243, + "learning_rate": 4.90771880948302e-05, + "loss": 0.42503364562988283, + "mean_token_accuracy": 0.968946928679943, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11436546228826046, + "epoch": 6.8192771084337345, + "grad_norm": 2.5898215770721436, + "learning_rate": 4.561190272239513e-05, + "loss": 0.4263697052001953, + "mean_token_accuracy": 0.967577712237835, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11348343381658196, + "epoch": 6.953145917001339, + "grad_norm": 1.4793730974197388, + "learning_rate": 4.222567875105448e-05, + "loss": 0.42732913970947267, + "mean_token_accuracy": 0.9683026453852653, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.20438951179385184, + "eval_loss": 0.8442238569259644, + "eval_mean_token_accuracy": 0.8623786172270775, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.5733, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.071, + "step": 2618 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0754369719077043e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b2adeb85f853dc74ab13629bfaa8db613d396d9 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2992/trainer_state.json @@ -0,0 +1,712 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + }, + { + "entropy": 0.1729431924871122, + "epoch": 5.080321285140562, + "grad_norm": 1.9089794158935547, + "learning_rate": 9.395941842759104e-05, + "loss": 0.6498579406738281, + "mean_token_accuracy": 0.948695200561273, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14844272032380104, + "epoch": 5.214190093708166, + "grad_norm": 2.9947986602783203, + "learning_rate": 9.016614462600325e-05, + "loss": 0.5658287048339844, + "mean_token_accuracy": 0.9562490177154541, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.15113764170557262, + "epoch": 5.34805890227577, + "grad_norm": 3.0459158420562744, + "learning_rate": 8.635492104894498e-05, + "loss": 0.569720458984375, + "mean_token_accuracy": 0.9561498582363128, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.15329553466290236, + "epoch": 5.481927710843373, + "grad_norm": 2.5919315814971924, + "learning_rate": 8.253404614943809e-05, + "loss": 0.5799734878540039, + "mean_token_accuracy": 0.954962648153305, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.1544624574482441, + "epoch": 5.615796519410977, + "grad_norm": 3.170863628387451, + "learning_rate": 7.871183939502759e-05, + "loss": 0.5769558715820312, + "mean_token_accuracy": 0.9549962303042412, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.1543046496436, + "epoch": 5.749665327978581, + "grad_norm": 1.843237280845642, + "learning_rate": 7.489662315320254e-05, + "loss": 0.5841741561889648, + "mean_token_accuracy": 0.9532951918244362, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.14787622597068548, + "epoch": 5.883534136546185, + "grad_norm": 2.890704393386841, + "learning_rate": 7.109670457050292e-05, + "loss": 0.5526316452026367, + "mean_token_accuracy": 0.9569103759527207, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2396429342031479, + "eval_loss": 0.7484959959983826, + "eval_mean_token_accuracy": 0.8640641874074936, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.6984, + "eval_samples_per_second": 16.526, + "eval_steps_per_second": 2.068, + "step": 2244 + }, + { + "entropy": 0.14854476036447467, + "epoch": 6.016064257028113, + "grad_norm": 2.5248372554779053, + "learning_rate": 6.732035748476789e-05, + "loss": 0.5454582977294922, + "mean_token_accuracy": 0.9571841708337417, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.10946711044758559, + "epoch": 6.149933065595716, + "grad_norm": 2.492736577987671, + "learning_rate": 6.357580440990978e-05, + "loss": 0.4096903991699219, + "mean_token_accuracy": 0.96969064027071, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.1087673882767558, + "epoch": 6.28380187416332, + "grad_norm": 2.550652027130127, + "learning_rate": 5.9871198632439174e-05, + "loss": 0.4065860748291016, + "mean_token_accuracy": 0.969154157936573, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11702938644215465, + "epoch": 6.417670682730924, + "grad_norm": 2.2913568019866943, + "learning_rate": 5.621460645872391e-05, + "loss": 0.4343274688720703, + "mean_token_accuracy": 0.9671943977475166, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.11134784514084459, + "epoch": 6.551539491298527, + "grad_norm": 2.7473230361938477, + "learning_rate": 5.2613989651636254e-05, + "loss": 0.4231544876098633, + "mean_token_accuracy": 0.968756687939167, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.11262517396360636, + "epoch": 6.685408299866131, + "grad_norm": 1.8821665048599243, + "learning_rate": 4.90771880948302e-05, + "loss": 0.42503364562988283, + "mean_token_accuracy": 0.968946928679943, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11436546228826046, + "epoch": 6.8192771084337345, + "grad_norm": 2.5898215770721436, + "learning_rate": 4.561190272239513e-05, + "loss": 0.4263697052001953, + "mean_token_accuracy": 0.967577712237835, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11348343381658196, + "epoch": 6.953145917001339, + "grad_norm": 1.4793730974197388, + "learning_rate": 4.222567875105448e-05, + "loss": 0.42732913970947267, + "mean_token_accuracy": 0.9683026453852653, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.20438951179385184, + "eval_loss": 0.8442238569259644, + "eval_mean_token_accuracy": 0.8623786172270775, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.5733, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.071, + "step": 2618 + }, + { + "entropy": 0.10301848094571721, + "epoch": 7.085676037483267, + "grad_norm": 1.7398229837417603, + "learning_rate": 3.8925889251419277e-05, + "loss": 0.3753490447998047, + "mean_token_accuracy": 0.9721216511244726, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.09001849109306931, + "epoch": 7.21954484605087, + "grad_norm": 1.2295695543289185, + "learning_rate": 3.571971909406742e-05, + "loss": 0.3318290710449219, + "mean_token_accuracy": 0.9757600504159928, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09881723931059241, + "epoch": 7.353413654618474, + "grad_norm": 1.2764956951141357, + "learning_rate": 3.2614149305404984e-05, + "loss": 0.3623368453979492, + "mean_token_accuracy": 0.973207780122757, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.09360236193984747, + "epoch": 7.4872824631860775, + "grad_norm": 1.6123838424682617, + "learning_rate": 2.961594186737198e-05, + "loss": 0.34752960205078126, + "mean_token_accuracy": 0.9750900790095329, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08683731975033879, + "epoch": 7.621151271753681, + "grad_norm": 2.689685821533203, + "learning_rate": 2.6731624994089548e-05, + "loss": 0.3220005798339844, + "mean_token_accuracy": 0.9765287268161774, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09308060238137841, + "epoch": 7.755020080321285, + "grad_norm": 2.0739898681640625, + "learning_rate": 2.3967478917506556e-05, + "loss": 0.34147651672363283, + "mean_token_accuracy": 0.975026119351387, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.09029730424284935, + "epoch": 7.888888888888889, + "grad_norm": 1.9244085550308228, + "learning_rate": 2.1329522212996067e-05, + "loss": 0.3376229476928711, + "mean_token_accuracy": 0.9750253957509994, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.16788041561841965, + "eval_loss": 0.9710925817489624, + "eval_mean_token_accuracy": 0.8621352380514145, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.6722, + "eval_samples_per_second": 16.53, + "eval_steps_per_second": 2.069, + "step": 2992 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.373756578602813e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3cc7f56edafe22843e326c0f56ad2bca98e1d47d --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3366/trainer_state.json @@ -0,0 +1,803 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.0, + "eval_steps": 500, + "global_step": 3366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + }, + { + "entropy": 0.1729431924871122, + "epoch": 5.080321285140562, + "grad_norm": 1.9089794158935547, + "learning_rate": 9.395941842759104e-05, + "loss": 0.6498579406738281, + "mean_token_accuracy": 0.948695200561273, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14844272032380104, + "epoch": 5.214190093708166, + "grad_norm": 2.9947986602783203, + "learning_rate": 9.016614462600325e-05, + "loss": 0.5658287048339844, + "mean_token_accuracy": 0.9562490177154541, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.15113764170557262, + "epoch": 5.34805890227577, + "grad_norm": 3.0459158420562744, + "learning_rate": 8.635492104894498e-05, + "loss": 0.569720458984375, + "mean_token_accuracy": 0.9561498582363128, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.15329553466290236, + "epoch": 5.481927710843373, + "grad_norm": 2.5919315814971924, + "learning_rate": 8.253404614943809e-05, + "loss": 0.5799734878540039, + "mean_token_accuracy": 0.954962648153305, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.1544624574482441, + "epoch": 5.615796519410977, + "grad_norm": 3.170863628387451, + "learning_rate": 7.871183939502759e-05, + "loss": 0.5769558715820312, + "mean_token_accuracy": 0.9549962303042412, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.1543046496436, + "epoch": 5.749665327978581, + "grad_norm": 1.843237280845642, + "learning_rate": 7.489662315320254e-05, + "loss": 0.5841741561889648, + "mean_token_accuracy": 0.9532951918244362, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.14787622597068548, + "epoch": 5.883534136546185, + "grad_norm": 2.890704393386841, + "learning_rate": 7.109670457050292e-05, + "loss": 0.5526316452026367, + "mean_token_accuracy": 0.9569103759527207, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2396429342031479, + "eval_loss": 0.7484959959983826, + "eval_mean_token_accuracy": 0.8640641874074936, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.6984, + "eval_samples_per_second": 16.526, + "eval_steps_per_second": 2.068, + "step": 2244 + }, + { + "entropy": 0.14854476036447467, + "epoch": 6.016064257028113, + "grad_norm": 2.5248372554779053, + "learning_rate": 6.732035748476789e-05, + "loss": 0.5454582977294922, + "mean_token_accuracy": 0.9571841708337417, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.10946711044758559, + "epoch": 6.149933065595716, + "grad_norm": 2.492736577987671, + "learning_rate": 6.357580440990978e-05, + "loss": 0.4096903991699219, + "mean_token_accuracy": 0.96969064027071, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.1087673882767558, + "epoch": 6.28380187416332, + "grad_norm": 2.550652027130127, + "learning_rate": 5.9871198632439174e-05, + "loss": 0.4065860748291016, + "mean_token_accuracy": 0.969154157936573, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11702938644215465, + "epoch": 6.417670682730924, + "grad_norm": 2.2913568019866943, + "learning_rate": 5.621460645872391e-05, + "loss": 0.4343274688720703, + "mean_token_accuracy": 0.9671943977475166, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.11134784514084459, + "epoch": 6.551539491298527, + "grad_norm": 2.7473230361938477, + "learning_rate": 5.2613989651636254e-05, + "loss": 0.4231544876098633, + "mean_token_accuracy": 0.968756687939167, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.11262517396360636, + "epoch": 6.685408299866131, + "grad_norm": 1.8821665048599243, + "learning_rate": 4.90771880948302e-05, + "loss": 0.42503364562988283, + "mean_token_accuracy": 0.968946928679943, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11436546228826046, + "epoch": 6.8192771084337345, + "grad_norm": 2.5898215770721436, + "learning_rate": 4.561190272239513e-05, + "loss": 0.4263697052001953, + "mean_token_accuracy": 0.967577712237835, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11348343381658196, + "epoch": 6.953145917001339, + "grad_norm": 1.4793730974197388, + "learning_rate": 4.222567875105448e-05, + "loss": 0.42732913970947267, + "mean_token_accuracy": 0.9683026453852653, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.20438951179385184, + "eval_loss": 0.8442238569259644, + "eval_mean_token_accuracy": 0.8623786172270775, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.5733, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.071, + "step": 2618 + }, + { + "entropy": 0.10301848094571721, + "epoch": 7.085676037483267, + "grad_norm": 1.7398229837417603, + "learning_rate": 3.8925889251419277e-05, + "loss": 0.3753490447998047, + "mean_token_accuracy": 0.9721216511244726, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.09001849109306931, + "epoch": 7.21954484605087, + "grad_norm": 1.2295695543289185, + "learning_rate": 3.571971909406742e-05, + "loss": 0.3318290710449219, + "mean_token_accuracy": 0.9757600504159928, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09881723931059241, + "epoch": 7.353413654618474, + "grad_norm": 1.2764956951141357, + "learning_rate": 3.2614149305404984e-05, + "loss": 0.3623368453979492, + "mean_token_accuracy": 0.973207780122757, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.09360236193984747, + "epoch": 7.4872824631860775, + "grad_norm": 1.6123838424682617, + "learning_rate": 2.961594186737198e-05, + "loss": 0.34752960205078126, + "mean_token_accuracy": 0.9750900790095329, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08683731975033879, + "epoch": 7.621151271753681, + "grad_norm": 2.689685821533203, + "learning_rate": 2.6731624994089548e-05, + "loss": 0.3220005798339844, + "mean_token_accuracy": 0.9765287268161774, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09308060238137841, + "epoch": 7.755020080321285, + "grad_norm": 2.0739898681640625, + "learning_rate": 2.3967478917506556e-05, + "loss": 0.34147651672363283, + "mean_token_accuracy": 0.975026119351387, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.09029730424284935, + "epoch": 7.888888888888889, + "grad_norm": 1.9244085550308228, + "learning_rate": 2.1329522212996067e-05, + "loss": 0.3376229476928711, + "mean_token_accuracy": 0.9750253957509994, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.16788041561841965, + "eval_loss": 0.9710925817489624, + "eval_mean_token_accuracy": 0.8621352380514145, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.6722, + "eval_samples_per_second": 16.53, + "eval_steps_per_second": 2.069, + "step": 2992 + }, + { + "entropy": 0.09257136479095378, + "epoch": 8.021419009370817, + "grad_norm": 0.6420087218284607, + "learning_rate": 1.882349869467544e-05, + "loss": 0.3422011566162109, + "mean_token_accuracy": 0.9748761277608197, + "num_tokens": 6997584.0, + "step": 3000 + }, + { + "entropy": 0.08579577693715691, + "epoch": 8.15528781793842, + "grad_norm": 0.6230213046073914, + "learning_rate": 1.6454864908983872e-05, + "loss": 0.3201043319702148, + "mean_token_accuracy": 0.9772940769791603, + "num_tokens": 7109661.0, + "step": 3050 + }, + { + "entropy": 0.08380989912897348, + "epoch": 8.289156626506024, + "grad_norm": 0.7393160462379456, + "learning_rate": 1.4228778253748889e-05, + "loss": 0.31029510498046875, + "mean_token_accuracy": 0.9774167820811271, + "num_tokens": 7228951.0, + "step": 3100 + }, + { + "entropy": 0.08082627209834754, + "epoch": 8.423025435073628, + "grad_norm": 1.839464783668518, + "learning_rate": 1.2150085748610697e-05, + "loss": 0.3011078643798828, + "mean_token_accuracy": 0.9782456710934639, + "num_tokens": 7345974.0, + "step": 3150 + }, + { + "entropy": 0.0822253195475787, + "epoch": 8.556894243641231, + "grad_norm": 1.0962920188903809, + "learning_rate": 1.0223313481255313e-05, + "loss": 0.3033403205871582, + "mean_token_accuracy": 0.9777043810486794, + "num_tokens": 7465280.0, + "step": 3200 + }, + { + "entropy": 0.08191775260493159, + "epoch": 8.690763052208835, + "grad_norm": 0.9543969035148621, + "learning_rate": 8.452656752436198e-06, + "loss": 0.30089645385742186, + "mean_token_accuracy": 0.9785924726724624, + "num_tokens": 7584144.0, + "step": 3250 + }, + { + "entropy": 0.08602304834872485, + "epoch": 8.824631860776439, + "grad_norm": 1.2917355298995972, + "learning_rate": 6.841970941242257e-06, + "loss": 0.3139409637451172, + "mean_token_accuracy": 0.9771846759319306, + "num_tokens": 7698337.0, + "step": 3300 + }, + { + "entropy": 0.08643955274485052, + "epoch": 8.958500669344042, + "grad_norm": 0.7915636301040649, + "learning_rate": 5.394763110501694e-06, + "loss": 0.32275047302246096, + "mean_token_accuracy": 0.9766862055659294, + "num_tokens": 7815006.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.1494569706916809, + "eval_loss": 1.0805635452270508, + "eval_mean_token_accuracy": 0.8623243406414985, + "eval_num_tokens": 7850223.0, + "eval_runtime": 96.1431, + "eval_samples_per_second": 16.621, + "eval_steps_per_second": 2.08, + "step": 3366 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6694552867277025e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e820916e2fc5a587d0fdc7d5166b85de94a2eb7f --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-374/trainer_state.json @@ -0,0 +1,115 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.947242714769866e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d652737f2a37d1cab0bc904cf9277bdf9e4bc7ff --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3740/trainer_state.json @@ -0,0 +1,884 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 3740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + }, + { + "entropy": 0.44389665202058926, + "epoch": 2.005354752342704, + "grad_norm": 3.126400947570801, + "learning_rate": 0.00015887802469157283, + "loss": 1.74362060546875, + "mean_token_accuracy": 0.882077858604566, + "num_tokens": 1749755.0, + "step": 750 + }, + { + "entropy": 0.36593644849956036, + "epoch": 2.139223560910308, + "grad_norm": 4.825572490692139, + "learning_rate": 0.000157483812567062, + "loss": 1.4261384582519532, + "mean_token_accuracy": 0.897953551709652, + "num_tokens": 1868231.0, + "step": 800 + }, + { + "entropy": 0.3735161118209362, + "epoch": 2.2730923694779115, + "grad_norm": 2.341724395751953, + "learning_rate": 0.00015592507503496244, + "loss": 1.4566732788085937, + "mean_token_accuracy": 0.8954837635159493, + "num_tokens": 1979116.0, + "step": 850 + }, + { + "entropy": 0.3712782260775566, + "epoch": 2.4069611780455156, + "grad_norm": 2.141064405441284, + "learning_rate": 0.00015420520604735334, + "loss": 1.4417454528808593, + "mean_token_accuracy": 0.8987472346425056, + "num_tokens": 2094539.0, + "step": 900 + }, + { + "entropy": 0.3738844521343708, + "epoch": 2.540829986613119, + "grad_norm": 2.904395818710327, + "learning_rate": 0.0001523279503994976, + "loss": 1.4441893005371094, + "mean_token_accuracy": 0.8981871575117111, + "num_tokens": 2209415.0, + "step": 950 + }, + { + "entropy": 0.3702578065544367, + "epoch": 2.674698795180723, + "grad_norm": 2.941880226135254, + "learning_rate": 0.00015029739557602818, + "loss": 1.4411444091796874, + "mean_token_accuracy": 0.8977779766917229, + "num_tokens": 2324269.0, + "step": 1000 + }, + { + "entropy": 0.37640198186039925, + "epoch": 2.8085676037483265, + "grad_norm": 4.814720153808594, + "learning_rate": 0.00014811796285097166, + "loss": 1.463765869140625, + "mean_token_accuracy": 0.8968957820534706, + "num_tokens": 2447336.0, + "step": 1050 + }, + { + "entropy": 0.3884583811461926, + "epoch": 2.9424364123159306, + "grad_norm": 2.023144483566284, + "learning_rate": 0.0001457943976609884, + "loss": 1.4860101318359376, + "mean_token_accuracy": 0.8945580047369003, + "num_tokens": 2565837.0, + "step": 1100 + }, + { + "epoch": 3.0, + "eval_entropy": 0.41501702144742014, + "eval_loss": 0.5103150010108948, + "eval_mean_token_accuracy": 0.8657891270518303, + "eval_num_tokens": 2616741.0, + "eval_runtime": 96.5282, + "eval_samples_per_second": 16.555, + "eval_steps_per_second": 2.072, + "step": 1122 + }, + { + "entropy": 0.33624990103822766, + "epoch": 3.074966532797858, + "grad_norm": 2.8879244327545166, + "learning_rate": 0.0001433317592727896, + "loss": 1.2471446990966797, + "mean_token_accuracy": 0.9082047313150733, + "num_tokens": 2685975.0, + "step": 1150 + }, + { + "entropy": 0.3088844185322523, + "epoch": 3.208835341365462, + "grad_norm": 2.8509788513183594, + "learning_rate": 0.00014073540976722957, + "loss": 1.1441875457763673, + "mean_token_accuracy": 0.9140481147170066, + "num_tokens": 2798277.0, + "step": 1200 + }, + { + "entropy": 0.30224390886723995, + "epoch": 3.3427041499330654, + "grad_norm": 2.5208239555358887, + "learning_rate": 0.00013801100236405915, + "loss": 1.1275232696533204, + "mean_token_accuracy": 0.9146806076169014, + "num_tokens": 2918973.0, + "step": 1250 + }, + { + "entropy": 0.295175199881196, + "epoch": 3.4765729585006695, + "grad_norm": 2.4297444820404053, + "learning_rate": 0.00013516446911276066, + "loss": 1.1239344787597656, + "mean_token_accuracy": 0.9151004731655121, + "num_tokens": 3039073.0, + "step": 1300 + }, + { + "entropy": 0.295776079967618, + "epoch": 3.610441767068273, + "grad_norm": 2.3972175121307373, + "learning_rate": 0.00013220200797626748, + "loss": 1.148626480102539, + "mean_token_accuracy": 0.9141753858327866, + "num_tokens": 3153710.0, + "step": 1350 + }, + { + "entropy": 0.2951671688258648, + "epoch": 3.7443105756358768, + "grad_norm": 2.1329967975616455, + "learning_rate": 0.00012913006933569033, + "loss": 1.1505547332763673, + "mean_token_accuracy": 0.9145446908473969, + "num_tokens": 3263594.0, + "step": 1400 + }, + { + "entropy": 0.28995474845170977, + "epoch": 3.878179384203481, + "grad_norm": 2.111231803894043, + "learning_rate": 0.0001259553419454356, + "loss": 1.12584228515625, + "mean_token_accuracy": 0.9153258377313613, + "num_tokens": 3386033.0, + "step": 1450 + }, + { + "epoch": 4.0, + "eval_entropy": 0.34918407052755357, + "eval_loss": 0.5466129183769226, + "eval_mean_token_accuracy": 0.8677999797463417, + "eval_num_tokens": 3488988.0, + "eval_runtime": 96.4025, + "eval_samples_per_second": 16.576, + "eval_steps_per_second": 2.075, + "step": 1496 + }, + { + "entropy": 0.28912228466284395, + "epoch": 4.010709504685408, + "grad_norm": 2.6088380813598633, + "learning_rate": 0.00012268473836929623, + "loss": 1.1048170471191405, + "mean_token_accuracy": 0.9165902002291246, + "num_tokens": 3498406.0, + "step": 1500 + }, + { + "entropy": 0.2037667266279459, + "epoch": 4.144578313253012, + "grad_norm": 2.7244584560394287, + "learning_rate": 0.00011932537992922588, + "loss": 0.7798351287841797, + "mean_token_accuracy": 0.9385521411895752, + "num_tokens": 3614301.0, + "step": 1550 + }, + { + "entropy": 0.21155868768692015, + "epoch": 4.278447121820616, + "grad_norm": 17.15939712524414, + "learning_rate": 0.00011588458119956922, + "loss": 0.8124880981445313, + "mean_token_accuracy": 0.9354887393116951, + "num_tokens": 3735705.0, + "step": 1600 + }, + { + "entropy": 0.21142005987465382, + "epoch": 4.412315930388219, + "grad_norm": 3.1366724967956543, + "learning_rate": 0.00011236983408050962, + "loss": 0.8087466430664062, + "mean_token_accuracy": 0.9360431012511253, + "num_tokens": 3854287.0, + "step": 1650 + }, + { + "entropy": 0.21077097810804843, + "epoch": 4.546184738955823, + "grad_norm": 2.4884378910064697, + "learning_rate": 0.0001087887914854125, + "loss": 0.8054198455810547, + "mean_token_accuracy": 0.9361811754107475, + "num_tokens": 3967362.0, + "step": 1700 + }, + { + "entropy": 0.21570609882473946, + "epoch": 4.680053547523427, + "grad_norm": 2.5278756618499756, + "learning_rate": 0.00010514925067758285, + "loss": 0.8254692077636718, + "mean_token_accuracy": 0.9351590833067894, + "num_tokens": 4081441.0, + "step": 1750 + }, + { + "entropy": 0.21050533920526504, + "epoch": 4.813922356091031, + "grad_norm": 2.563352584838867, + "learning_rate": 0.00010145913629271953, + "loss": 0.8124603271484375, + "mean_token_accuracy": 0.9365199673175811, + "num_tokens": 4197604.0, + "step": 1800 + }, + { + "entropy": 0.21181554518640042, + "epoch": 4.947791164658635, + "grad_norm": 2.7008941173553467, + "learning_rate": 9.772648308403213e-05, + "loss": 0.8135105895996094, + "mean_token_accuracy": 0.9371505591273308, + "num_tokens": 4318894.0, + "step": 1850 + }, + { + "epoch": 5.0, + "eval_entropy": 0.2898014415055513, + "eval_loss": 0.6175746917724609, + "eval_mean_token_accuracy": 0.8674856871366501, + "eval_num_tokens": 4361235.0, + "eval_runtime": 96.3162, + "eval_samples_per_second": 16.591, + "eval_steps_per_second": 2.076, + "step": 1870 + }, + { + "entropy": 0.1729431924871122, + "epoch": 5.080321285140562, + "grad_norm": 1.9089794158935547, + "learning_rate": 9.395941842759104e-05, + "loss": 0.6498579406738281, + "mean_token_accuracy": 0.948695200561273, + "num_tokens": 4434412.0, + "step": 1900 + }, + { + "entropy": 0.14844272032380104, + "epoch": 5.214190093708166, + "grad_norm": 2.9947986602783203, + "learning_rate": 9.016614462600325e-05, + "loss": 0.5658287048339844, + "mean_token_accuracy": 0.9562490177154541, + "num_tokens": 4548703.0, + "step": 1950 + }, + { + "entropy": 0.15113764170557262, + "epoch": 5.34805890227577, + "grad_norm": 3.0459158420562744, + "learning_rate": 8.635492104894498e-05, + "loss": 0.569720458984375, + "mean_token_accuracy": 0.9561498582363128, + "num_tokens": 4665542.0, + "step": 2000 + }, + { + "entropy": 0.15329553466290236, + "epoch": 5.481927710843373, + "grad_norm": 2.5919315814971924, + "learning_rate": 8.253404614943809e-05, + "loss": 0.5799734878540039, + "mean_token_accuracy": 0.954962648153305, + "num_tokens": 4778075.0, + "step": 2050 + }, + { + "entropy": 0.1544624574482441, + "epoch": 5.615796519410977, + "grad_norm": 3.170863628387451, + "learning_rate": 7.871183939502759e-05, + "loss": 0.5769558715820312, + "mean_token_accuracy": 0.9549962303042412, + "num_tokens": 4897453.0, + "step": 2100 + }, + { + "entropy": 0.1543046496436, + "epoch": 5.749665327978581, + "grad_norm": 1.843237280845642, + "learning_rate": 7.489662315320254e-05, + "loss": 0.5841741561889648, + "mean_token_accuracy": 0.9532951918244362, + "num_tokens": 5012767.0, + "step": 2150 + }, + { + "entropy": 0.14787622597068548, + "epoch": 5.883534136546185, + "grad_norm": 2.890704393386841, + "learning_rate": 7.109670457050292e-05, + "loss": 0.5526316452026367, + "mean_token_accuracy": 0.9569103759527207, + "num_tokens": 5129878.0, + "step": 2200 + }, + { + "epoch": 6.0, + "eval_entropy": 0.2396429342031479, + "eval_loss": 0.7484959959983826, + "eval_mean_token_accuracy": 0.8640641874074936, + "eval_num_tokens": 5233482.0, + "eval_runtime": 96.6984, + "eval_samples_per_second": 16.526, + "eval_steps_per_second": 2.068, + "step": 2244 + }, + { + "entropy": 0.14854476036447467, + "epoch": 6.016064257028113, + "grad_norm": 2.5248372554779053, + "learning_rate": 6.732035748476789e-05, + "loss": 0.5454582977294922, + "mean_token_accuracy": 0.9571841708337417, + "num_tokens": 5246734.0, + "step": 2250 + }, + { + "entropy": 0.10946711044758559, + "epoch": 6.149933065595716, + "grad_norm": 2.492736577987671, + "learning_rate": 6.357580440990978e-05, + "loss": 0.4096903991699219, + "mean_token_accuracy": 0.96969064027071, + "num_tokens": 5366334.0, + "step": 2300 + }, + { + "entropy": 0.1087673882767558, + "epoch": 6.28380187416332, + "grad_norm": 2.550652027130127, + "learning_rate": 5.9871198632439174e-05, + "loss": 0.4065860748291016, + "mean_token_accuracy": 0.969154157936573, + "num_tokens": 5487013.0, + "step": 2350 + }, + { + "entropy": 0.11702938644215465, + "epoch": 6.417670682730924, + "grad_norm": 2.2913568019866943, + "learning_rate": 5.621460645872391e-05, + "loss": 0.4343274688720703, + "mean_token_accuracy": 0.9671943977475166, + "num_tokens": 5599415.0, + "step": 2400 + }, + { + "entropy": 0.11134784514084459, + "epoch": 6.551539491298527, + "grad_norm": 2.7473230361938477, + "learning_rate": 5.2613989651636254e-05, + "loss": 0.4231544876098633, + "mean_token_accuracy": 0.968756687939167, + "num_tokens": 5718099.0, + "step": 2450 + }, + { + "entropy": 0.11262517396360636, + "epoch": 6.685408299866131, + "grad_norm": 1.8821665048599243, + "learning_rate": 4.90771880948302e-05, + "loss": 0.42503364562988283, + "mean_token_accuracy": 0.968946928679943, + "num_tokens": 5833902.0, + "step": 2500 + }, + { + "entropy": 0.11436546228826046, + "epoch": 6.8192771084337345, + "grad_norm": 2.5898215770721436, + "learning_rate": 4.561190272239513e-05, + "loss": 0.4263697052001953, + "mean_token_accuracy": 0.967577712237835, + "num_tokens": 5948132.0, + "step": 2550 + }, + { + "entropy": 0.11348343381658196, + "epoch": 6.953145917001339, + "grad_norm": 1.4793730974197388, + "learning_rate": 4.222567875105448e-05, + "loss": 0.42732913970947267, + "mean_token_accuracy": 0.9683026453852653, + "num_tokens": 6066224.0, + "step": 2600 + }, + { + "epoch": 7.0, + "eval_entropy": 0.20438951179385184, + "eval_loss": 0.8442238569259644, + "eval_mean_token_accuracy": 0.8623786172270775, + "eval_num_tokens": 6105729.0, + "eval_runtime": 96.5733, + "eval_samples_per_second": 16.547, + "eval_steps_per_second": 2.071, + "step": 2618 + }, + { + "entropy": 0.10301848094571721, + "epoch": 7.085676037483267, + "grad_norm": 1.7398229837417603, + "learning_rate": 3.8925889251419277e-05, + "loss": 0.3753490447998047, + "mean_token_accuracy": 0.9721216511244726, + "num_tokens": 6183429.0, + "step": 2650 + }, + { + "entropy": 0.09001849109306931, + "epoch": 7.21954484605087, + "grad_norm": 1.2295695543289185, + "learning_rate": 3.571971909406742e-05, + "loss": 0.3318290710449219, + "mean_token_accuracy": 0.9757600504159928, + "num_tokens": 6303432.0, + "step": 2700 + }, + { + "entropy": 0.09881723931059241, + "epoch": 7.353413654618474, + "grad_norm": 1.2764956951141357, + "learning_rate": 3.2614149305404984e-05, + "loss": 0.3623368453979492, + "mean_token_accuracy": 0.973207780122757, + "num_tokens": 6414176.0, + "step": 2750 + }, + { + "entropy": 0.09360236193984747, + "epoch": 7.4872824631860775, + "grad_norm": 1.6123838424682617, + "learning_rate": 2.961594186737198e-05, + "loss": 0.34752960205078126, + "mean_token_accuracy": 0.9750900790095329, + "num_tokens": 6531772.0, + "step": 2800 + }, + { + "entropy": 0.08683731975033879, + "epoch": 7.621151271753681, + "grad_norm": 2.689685821533203, + "learning_rate": 2.6731624994089548e-05, + "loss": 0.3220005798339844, + "mean_token_accuracy": 0.9765287268161774, + "num_tokens": 6655745.0, + "step": 2850 + }, + { + "entropy": 0.09308060238137841, + "epoch": 7.755020080321285, + "grad_norm": 2.0739898681640625, + "learning_rate": 2.3967478917506556e-05, + "loss": 0.34147651672363283, + "mean_token_accuracy": 0.975026119351387, + "num_tokens": 6772303.0, + "step": 2900 + }, + { + "entropy": 0.09029730424284935, + "epoch": 7.888888888888889, + "grad_norm": 1.9244085550308228, + "learning_rate": 2.1329522212996067e-05, + "loss": 0.3376229476928711, + "mean_token_accuracy": 0.9750253957509994, + "num_tokens": 6887254.0, + "step": 2950 + }, + { + "epoch": 8.0, + "eval_entropy": 0.16788041561841965, + "eval_loss": 0.9710925817489624, + "eval_mean_token_accuracy": 0.8621352380514145, + "eval_num_tokens": 6977976.0, + "eval_runtime": 96.6722, + "eval_samples_per_second": 16.53, + "eval_steps_per_second": 2.069, + "step": 2992 + }, + { + "entropy": 0.09257136479095378, + "epoch": 8.021419009370817, + "grad_norm": 0.6420087218284607, + "learning_rate": 1.882349869467544e-05, + "loss": 0.3422011566162109, + "mean_token_accuracy": 0.9748761277608197, + "num_tokens": 6997584.0, + "step": 3000 + }, + { + "entropy": 0.08579577693715691, + "epoch": 8.15528781793842, + "grad_norm": 0.6230213046073914, + "learning_rate": 1.6454864908983872e-05, + "loss": 0.3201043319702148, + "mean_token_accuracy": 0.9772940769791603, + "num_tokens": 7109661.0, + "step": 3050 + }, + { + "entropy": 0.08380989912897348, + "epoch": 8.289156626506024, + "grad_norm": 0.7393160462379456, + "learning_rate": 1.4228778253748889e-05, + "loss": 0.31029510498046875, + "mean_token_accuracy": 0.9774167820811271, + "num_tokens": 7228951.0, + "step": 3100 + }, + { + "entropy": 0.08082627209834754, + "epoch": 8.423025435073628, + "grad_norm": 1.839464783668518, + "learning_rate": 1.2150085748610697e-05, + "loss": 0.3011078643798828, + "mean_token_accuracy": 0.9782456710934639, + "num_tokens": 7345974.0, + "step": 3150 + }, + { + "entropy": 0.0822253195475787, + "epoch": 8.556894243641231, + "grad_norm": 1.0962920188903809, + "learning_rate": 1.0223313481255313e-05, + "loss": 0.3033403205871582, + "mean_token_accuracy": 0.9777043810486794, + "num_tokens": 7465280.0, + "step": 3200 + }, + { + "entropy": 0.08191775260493159, + "epoch": 8.690763052208835, + "grad_norm": 0.9543969035148621, + "learning_rate": 8.452656752436198e-06, + "loss": 0.30089645385742186, + "mean_token_accuracy": 0.9785924726724624, + "num_tokens": 7584144.0, + "step": 3250 + }, + { + "entropy": 0.08602304834872485, + "epoch": 8.824631860776439, + "grad_norm": 1.2917355298995972, + "learning_rate": 6.841970941242257e-06, + "loss": 0.3139409637451172, + "mean_token_accuracy": 0.9771846759319306, + "num_tokens": 7698337.0, + "step": 3300 + }, + { + "entropy": 0.08643955274485052, + "epoch": 8.958500669344042, + "grad_norm": 0.7915636301040649, + "learning_rate": 5.394763110501694e-06, + "loss": 0.32275047302246096, + "mean_token_accuracy": 0.9766862055659294, + "num_tokens": 7815006.0, + "step": 3350 + }, + { + "epoch": 9.0, + "eval_entropy": 0.1494569706916809, + "eval_loss": 1.0805635452270508, + "eval_mean_token_accuracy": 0.8623243406414985, + "eval_num_tokens": 7850223.0, + "eval_runtime": 96.1431, + "eval_samples_per_second": 16.621, + "eval_steps_per_second": 2.08, + "step": 3366 + }, + { + "entropy": 0.08869564228437164, + "epoch": 9.09103078982597, + "grad_norm": 0.7001141309738159, + "learning_rate": 4.114184370600321e-06, + "loss": 0.31673063278198244, + "mean_token_accuracy": 0.9770309651138807, + "num_tokens": 7924040.0, + "step": 3400 + }, + { + "entropy": 0.08074495340697467, + "epoch": 9.224899598393574, + "grad_norm": 0.5722501277923584, + "learning_rate": 3.003023018340723e-06, + "loss": 0.29645233154296874, + "mean_token_accuracy": 0.9780940434336662, + "num_tokens": 8035493.0, + "step": 3450 + }, + { + "entropy": 0.08180667289532721, + "epoch": 9.358768406961179, + "grad_norm": 0.6047417521476746, + "learning_rate": 2.0636984657818187e-06, + "loss": 0.29764341354370116, + "mean_token_accuracy": 0.9783824861049653, + "num_tokens": 8150965.0, + "step": 3500 + }, + { + "entropy": 0.07683482679538428, + "epoch": 9.492637215528783, + "grad_norm": 2.4113316535949707, + "learning_rate": 1.298255972277725e-06, + "loss": 0.2808952713012695, + "mean_token_accuracy": 0.9798626834154129, + "num_tokens": 8273746.0, + "step": 3550 + }, + { + "entropy": 0.08143842275254429, + "epoch": 9.626506024096386, + "grad_norm": 0.8744550347328186, + "learning_rate": 7.083621911865702e-07, + "loss": 0.29585289001464843, + "mean_token_accuracy": 0.9784767204523086, + "num_tokens": 8391652.0, + "step": 3600 + }, + { + "entropy": 0.07885600795969366, + "epoch": 9.76037483266399, + "grad_norm": 1.0551655292510986, + "learning_rate": 2.953015409454723e-07, + "loss": 0.2857367134094238, + "mean_token_accuracy": 0.9793685188889504, + "num_tokens": 8509546.0, + "step": 3650 + }, + { + "entropy": 0.07381519969552755, + "epoch": 9.894243641231594, + "grad_norm": 0.7463138103485107, + "learning_rate": 5.997340841324148e-08, + "loss": 0.27086233139038085, + "mean_token_accuracy": 0.9803544229269028, + "num_tokens": 8634626.0, + "step": 3700 + }, + { + "epoch": 10.0, + "eval_entropy": 0.1379357658699155, + "eval_loss": 1.1639536619186401, + "eval_mean_token_accuracy": 0.8618501043319702, + "eval_num_tokens": 8722470.0, + "eval_runtime": 96.5903, + "eval_samples_per_second": 16.544, + "eval_steps_per_second": 2.071, + "step": 3740 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.9668458004157363e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9abe250820de6d55106ad056cc8dddd15cd6bd60 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05026173039334608, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5bb5a15342afd2aba51b80829251e253ce2ddbe2 --- /dev/null +++ b/DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-748/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.2582822921872139, + "epoch": 0.13386880856760375, + "grad_norm": 5.033235549926758, + "learning_rate": 2.1466288485778066e-05, + "loss": 5.046328735351563, + "mean_token_accuracy": 0.7502484863996506, + "num_tokens": 116199.0, + "step": 50 + }, + { + "entropy": 0.5685664692521095, + "epoch": 0.2677376171352075, + "grad_norm": 3.554903030395508, + "learning_rate": 4.337066449167405e-05, + "loss": 2.209185791015625, + "mean_token_accuracy": 0.8581047981977463, + "num_tokens": 232864.0, + "step": 100 + }, + { + "entropy": 0.5092987871170044, + "epoch": 0.40160642570281124, + "grad_norm": 5.598084926605225, + "learning_rate": 6.527504049757005e-05, + "loss": 2.0146630859375, + "mean_token_accuracy": 0.8670356649160386, + "num_tokens": 352382.0, + "step": 150 + }, + { + "entropy": 0.4832092320919037, + "epoch": 0.535475234270415, + "grad_norm": 3.579439163208008, + "learning_rate": 8.717941650346603e-05, + "loss": 1.9066175842285156, + "mean_token_accuracy": 0.8735517236590385, + "num_tokens": 474532.0, + "step": 200 + }, + { + "entropy": 0.49370287612080577, + "epoch": 0.6693440428380187, + "grad_norm": 22.771854400634766, + "learning_rate": 0.00010908379250936202, + "loss": 1.9317852783203124, + "mean_token_accuracy": 0.8717742815613747, + "num_tokens": 589198.0, + "step": 250 + }, + { + "entropy": 0.475419160425663, + "epoch": 0.8032128514056225, + "grad_norm": 191.79444885253906, + "learning_rate": 0.000130988168515258, + "loss": 1.9097901916503905, + "mean_token_accuracy": 0.8744278407096863, + "num_tokens": 707057.0, + "step": 300 + }, + { + "entropy": 0.5022796393930912, + "epoch": 0.9370816599732262, + "grad_norm": 3.216644287109375, + "learning_rate": 0.00015289254452115398, + "loss": 2.147085876464844, + "mean_token_accuracy": 0.869993035197258, + "num_tokens": 822888.0, + "step": 350 + }, + { + "epoch": 1.0, + "eval_entropy": 0.5250271247327327, + "eval_loss": 0.5184861421585083, + "eval_mean_token_accuracy": 0.8612929663062096, + "eval_num_tokens": 872247.0, + "eval_runtime": 96.6309, + "eval_samples_per_second": 16.537, + "eval_steps_per_second": 2.07, + "step": 374 + }, + { + "entropy": 0.4808884654382263, + "epoch": 1.069611780455154, + "grad_norm": 2.666057586669922, + "learning_rate": 0.00016382243255158818, + "loss": 1.8543489074707031, + "mean_token_accuracy": 0.8768212256407497, + "num_tokens": 929365.0, + "step": 400 + }, + { + "entropy": 0.45330303743481637, + "epoch": 1.2034805890227578, + "grad_norm": 2.1171557903289795, + "learning_rate": 0.00016364410560779942, + "loss": 1.7948956298828125, + "mean_token_accuracy": 0.8791207140684127, + "num_tokens": 1046629.0, + "step": 450 + }, + { + "entropy": 0.45347678795456886, + "epoch": 1.3373493975903614, + "grad_norm": 4.511318683624268, + "learning_rate": 0.00016328784000438723, + "loss": 1.7988812255859374, + "mean_token_accuracy": 0.8801145932078361, + "num_tokens": 1165744.0, + "step": 500 + }, + { + "entropy": 0.44952490359544756, + "epoch": 1.4712182061579653, + "grad_norm": 2.961987257003784, + "learning_rate": 0.0001627544114642431, + "loss": 1.7823495483398437, + "mean_token_accuracy": 0.8799195346236229, + "num_tokens": 1284843.0, + "step": 550 + }, + { + "entropy": 0.4502506497502327, + "epoch": 1.605087014725569, + "grad_norm": 2.924865484237671, + "learning_rate": 0.000162044981459947, + "loss": 1.7603852844238281, + "mean_token_accuracy": 0.8811277949810028, + "num_tokens": 1406485.0, + "step": 600 + }, + { + "entropy": 0.44528300017118455, + "epoch": 1.7389558232931726, + "grad_norm": 2.928840160369873, + "learning_rate": 0.00016116109468480906, + "loss": 1.7513160705566406, + "mean_token_accuracy": 0.8816375133395195, + "num_tokens": 1525460.0, + "step": 650 + }, + { + "entropy": 0.43578719861805437, + "epoch": 1.8728246318607764, + "grad_norm": 15.26456356048584, + "learning_rate": 0.00016010467568949708, + "loss": 1.7112632751464845, + "mean_token_accuracy": 0.884103564620018, + "num_tokens": 1638984.0, + "step": 700 + }, + { + "epoch": 2.0, + "eval_entropy": 0.4700849764049053, + "eval_loss": 0.49145790934562683, + "eval_mean_token_accuracy": 0.8681637060642242, + "eval_num_tokens": 1744494.0, + "eval_runtime": 96.5704, + "eval_samples_per_second": 16.548, + "eval_steps_per_second": 2.071, + "step": 748 + } + ], + "logging_steps": 50, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.920219300598998e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91db40851463ac42dcf1b728985e59b9df3a916e --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-780/trainer_state.json @@ -0,0 +1,853 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.8811104405552204, + "eval_steps": 20, + "global_step": 780, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.519332302187689e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5b4310a836237d93697a8e1296dc66ee7402753f --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-80/trainer_state.json @@ -0,0 +1,118 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.19312009656004828, + "eval_steps": 20, + "global_step": 80, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.67938648931657e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af969bbe591de1f41dc419698acbbee08a8ce3e6 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-800/trainer_state.json @@ -0,0 +1,874 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9293904646952322, + "eval_steps": 20, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.696819086240467e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27c339610106ea1b0b907a3c0edc1d1f847efe0b --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-820/trainer_state.json @@ -0,0 +1,895 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9776704888352445, + "eval_steps": 20, + "global_step": 820, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.855600064591891e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27c34ad7a3cb0e784238bb71dd4a4bdbe8b4cb09 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-840/trainer_state.json @@ -0,0 +1,916 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.024140012070006, + "eval_steps": 20, + "global_step": 840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.031650787581724e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d09b09e511b2ce7b883f76c17b9be4070d614c1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-860/trainer_state.json @@ -0,0 +1,937 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0724200362100182, + "eval_steps": 20, + "global_step": 860, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.207153734702186e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..61faa3b25c5b05f8269ca4ca5f724ecda639a30a --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-880/trainer_state.json @@ -0,0 +1,958 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.12070006035003, + "eval_steps": 20, + "global_step": 880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.368444118725458e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3d2c5fd2ded4eba006e5927e2bf24bbe8fdc8f1d --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-900/trainer_state.json @@ -0,0 +1,979 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.1689800844900424, + "eval_steps": 20, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + }, + { + "entropy": 0.40760905370116235, + "epoch": 2.1689800844900424, + "grad_norm": 2.571462631225586, + "learning_rate": 0.00013010355922233707, + "loss": 1.5575182914733887, + "mean_token_accuracy": 0.8846474155783653, + "num_tokens": 2117470.0, + "step": 900 + }, + { + "epoch": 2.1689800844900424, + "eval_entropy": 0.4561347976494371, + "eval_loss": 0.5359405875205994, + "eval_mean_token_accuracy": 0.8610902686467331, + "eval_num_tokens": 2117470.0, + "eval_runtime": 90.7395, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 900 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.547248525815372e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4942664a2bdebf007a6236015f858145388ebfc7 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-920/trainer_state.json @@ -0,0 +1,1000 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2172601086300543, + "eval_steps": 20, + "global_step": 920, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + }, + { + "entropy": 0.40760905370116235, + "epoch": 2.1689800844900424, + "grad_norm": 2.571462631225586, + "learning_rate": 0.00013010355922233707, + "loss": 1.5575182914733887, + "mean_token_accuracy": 0.8846474155783653, + "num_tokens": 2117470.0, + "step": 900 + }, + { + "epoch": 2.1689800844900424, + "eval_entropy": 0.4561347976494371, + "eval_loss": 0.5359405875205994, + "eval_mean_token_accuracy": 0.8610902686467331, + "eval_num_tokens": 2117470.0, + "eval_runtime": 90.7395, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 900 + }, + { + "entropy": 0.4111258488148451, + "epoch": 2.2172601086300543, + "grad_norm": 1.8378095626831055, + "learning_rate": 0.00012964300611154316, + "loss": 1.538413143157959, + "mean_token_accuracy": 0.8867764480412006, + "num_tokens": 2169713.0, + "step": 920 + }, + { + "epoch": 2.2172601086300543, + "eval_entropy": 0.43873994337039046, + "eval_loss": 0.5272142887115479, + "eval_mean_token_accuracy": 0.8617157025283642, + "eval_num_tokens": 2169713.0, + "eval_runtime": 90.7602, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 920 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.733980878257937e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9076ef2215aebc28e777398a243691be95f922de --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-940/trainer_state.json @@ -0,0 +1,1021 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.2655401327700666, + "eval_steps": 20, + "global_step": 940, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + }, + { + "entropy": 0.40760905370116235, + "epoch": 2.1689800844900424, + "grad_norm": 2.571462631225586, + "learning_rate": 0.00013010355922233707, + "loss": 1.5575182914733887, + "mean_token_accuracy": 0.8846474155783653, + "num_tokens": 2117470.0, + "step": 900 + }, + { + "epoch": 2.1689800844900424, + "eval_entropy": 0.4561347976494371, + "eval_loss": 0.5359405875205994, + "eval_mean_token_accuracy": 0.8610902686467331, + "eval_num_tokens": 2117470.0, + "eval_runtime": 90.7395, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 900 + }, + { + "entropy": 0.4111258488148451, + "epoch": 2.2172601086300543, + "grad_norm": 1.8378095626831055, + "learning_rate": 0.00012964300611154316, + "loss": 1.538413143157959, + "mean_token_accuracy": 0.8867764480412006, + "num_tokens": 2169713.0, + "step": 920 + }, + { + "epoch": 2.2172601086300543, + "eval_entropy": 0.43873994337039046, + "eval_loss": 0.5272142887115479, + "eval_mean_token_accuracy": 0.8617157025283642, + "eval_num_tokens": 2169713.0, + "eval_runtime": 90.7602, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 920 + }, + { + "entropy": 0.4261228807270527, + "epoch": 2.2655401327700666, + "grad_norm": 2.3900320529937744, + "learning_rate": 0.0001291649587621756, + "loss": 1.58123836517334, + "mean_token_accuracy": 0.8852489396929741, + "num_tokens": 2211210.0, + "step": 940 + }, + { + "epoch": 2.2655401327700666, + "eval_entropy": 0.40514066028461027, + "eval_loss": 0.5390793681144714, + "eval_mean_token_accuracy": 0.8620827519491817, + "eval_num_tokens": 2211210.0, + "eval_runtime": 90.761, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 940 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.885825829722867e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..add9468553ae172eb4000232933e56ce392943bc --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-960/trainer_state.json @@ -0,0 +1,1042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3138201569100785, + "eval_steps": 20, + "global_step": 960, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + }, + { + "entropy": 0.40760905370116235, + "epoch": 2.1689800844900424, + "grad_norm": 2.571462631225586, + "learning_rate": 0.00013010355922233707, + "loss": 1.5575182914733887, + "mean_token_accuracy": 0.8846474155783653, + "num_tokens": 2117470.0, + "step": 900 + }, + { + "epoch": 2.1689800844900424, + "eval_entropy": 0.4561347976494371, + "eval_loss": 0.5359405875205994, + "eval_mean_token_accuracy": 0.8610902686467331, + "eval_num_tokens": 2117470.0, + "eval_runtime": 90.7395, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 900 + }, + { + "entropy": 0.4111258488148451, + "epoch": 2.2172601086300543, + "grad_norm": 1.8378095626831055, + "learning_rate": 0.00012964300611154316, + "loss": 1.538413143157959, + "mean_token_accuracy": 0.8867764480412006, + "num_tokens": 2169713.0, + "step": 920 + }, + { + "epoch": 2.2172601086300543, + "eval_entropy": 0.43873994337039046, + "eval_loss": 0.5272142887115479, + "eval_mean_token_accuracy": 0.8617157025283642, + "eval_num_tokens": 2169713.0, + "eval_runtime": 90.7602, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 920 + }, + { + "entropy": 0.4261228807270527, + "epoch": 2.2655401327700666, + "grad_norm": 2.3900320529937744, + "learning_rate": 0.0001291649587621756, + "loss": 1.58123836517334, + "mean_token_accuracy": 0.8852489396929741, + "num_tokens": 2211210.0, + "step": 940 + }, + { + "epoch": 2.2655401327700666, + "eval_entropy": 0.40514066028461027, + "eval_loss": 0.5390793681144714, + "eval_mean_token_accuracy": 0.8620827519491817, + "eval_num_tokens": 2211210.0, + "eval_runtime": 90.761, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 940 + }, + { + "entropy": 0.4236912790685892, + "epoch": 2.3138201569100785, + "grad_norm": 1.635823130607605, + "learning_rate": 0.00012866955245597952, + "loss": 1.5851353645324706, + "mean_token_accuracy": 0.8841134652495384, + "num_tokens": 2256672.0, + "step": 960 + }, + { + "epoch": 2.3138201569100785, + "eval_entropy": 0.46641212182768277, + "eval_loss": 0.5175439119338989, + "eval_mean_token_accuracy": 0.8625401358925895, + "eval_num_tokens": 2256672.0, + "eval_runtime": 90.7723, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 960 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.048500458172792e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/README.md b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecd8d6a3d45759ed195574ce08064042ee486ea1 --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/README.md @@ -0,0 +1,209 @@ +--- +base_model: google/gemma-4-31B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:google/gemma-4-31B +- lora +- sft +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/adapter_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f43f588183c3a6860ce09a29af1b562bae0504be --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-4-31B", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.047757012531964065, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$", + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/tokenizer_config.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf51dc7f4d936ea414099bc3bb7579d17a0184ca --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "left", + "processor_class": "Gemma4Processor", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "" +} diff --git a/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/trainer_state.json b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5f71c3c2c0becb17274d9fc613bcd5b1021721c --- /dev/null +++ b/overgeneralisation_original_Estonian/gemma-4-31B_overgeneralisation_splits_original_features_train_overgeneralisation_splits_original_features_test1/checkpoint-980/trainer_state.json @@ -0,0 +1,1063 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.3621001810500903, + "eval_steps": 20, + "global_step": 980, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.675290709733963, + "epoch": 0.04828002414001207, + "grad_norm": 6.707607269287109, + "learning_rate": 6.210328529812303e-06, + "loss": 7.104328918457031, + "mean_token_accuracy": 0.6682514727115632, + "num_tokens": 48182.0, + "step": 20 + }, + { + "epoch": 0.04828002414001207, + "eval_entropy": 1.5423412115386363, + "eval_loss": 1.416153907775879, + "eval_mean_token_accuracy": 0.713003780734673, + "eval_num_tokens": 48182.0, + "eval_runtime": 90.8818, + "eval_samples_per_second": 15.625, + "eval_steps_per_second": 1.959, + "step": 20 + }, + { + "entropy": 1.1686139158904552, + "epoch": 0.09656004828002414, + "grad_norm": 3.5588884353637695, + "learning_rate": 1.2747516455930517e-05, + "loss": 4.294140243530274, + "mean_token_accuracy": 0.7630169309675694, + "num_tokens": 97030.0, + "step": 40 + }, + { + "epoch": 0.09656004828002414, + "eval_entropy": 0.801704225580344, + "eval_loss": 0.7841165661811829, + "eval_mean_token_accuracy": 0.8063843169908845, + "eval_num_tokens": 97030.0, + "eval_runtime": 90.7834, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 40 + }, + { + "entropy": 0.7488047637045383, + "epoch": 0.14484007242003621, + "grad_norm": 4.866708755493164, + "learning_rate": 1.9284704382048732e-05, + "loss": 2.9088117599487306, + "mean_token_accuracy": 0.8165101781487465, + "num_tokens": 144528.0, + "step": 60 + }, + { + "epoch": 0.14484007242003621, + "eval_entropy": 0.6810337137640192, + "eval_loss": 0.6656371355056763, + "eval_mean_token_accuracy": 0.8306669830606225, + "eval_num_tokens": 144528.0, + "eval_runtime": 90.8474, + "eval_samples_per_second": 15.631, + "eval_steps_per_second": 1.959, + "step": 60 + }, + { + "entropy": 0.6792228668928146, + "epoch": 0.19312009656004828, + "grad_norm": 4.510631084442139, + "learning_rate": 2.5821892308166943e-05, + "loss": 2.6342445373535157, + "mean_token_accuracy": 0.8298680819571018, + "num_tokens": 189657.0, + "step": 80 + }, + { + "epoch": 0.19312009656004828, + "eval_entropy": 0.6384875539983257, + "eval_loss": 0.6206316947937012, + "eval_mean_token_accuracy": 0.8366272945082589, + "eval_num_tokens": 189657.0, + "eval_runtime": 90.8078, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 80 + }, + { + "entropy": 0.6113388158380986, + "epoch": 0.24140012070006034, + "grad_norm": 2.513516664505005, + "learning_rate": 3.235908023428516e-05, + "loss": 2.413893127441406, + "mean_token_accuracy": 0.8396451488137245, + "num_tokens": 238869.0, + "step": 100 + }, + { + "epoch": 0.24140012070006034, + "eval_entropy": 0.6067953471387371, + "eval_loss": 0.6021680235862732, + "eval_mean_token_accuracy": 0.839132690362716, + "eval_num_tokens": 238869.0, + "eval_runtime": 90.7994, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 100 + }, + { + "entropy": 0.6011081866919994, + "epoch": 0.28968014484007243, + "grad_norm": 3.0723557472229004, + "learning_rate": 3.8896268160403376e-05, + "loss": 2.3560277938842775, + "mean_token_accuracy": 0.8405322283506393, + "num_tokens": 286432.0, + "step": 120 + }, + { + "epoch": 0.28968014484007243, + "eval_entropy": 0.5886335322696171, + "eval_loss": 0.5883614420890808, + "eval_mean_token_accuracy": 0.8427048559938923, + "eval_num_tokens": 286432.0, + "eval_runtime": 90.7823, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 120 + }, + { + "entropy": 0.5986796505749226, + "epoch": 0.33796016898008446, + "grad_norm": 2.583876609802246, + "learning_rate": 4.543345608652159e-05, + "loss": 2.3548404693603517, + "mean_token_accuracy": 0.8397360973060131, + "num_tokens": 335416.0, + "step": 140 + }, + { + "epoch": 0.33796016898008446, + "eval_entropy": 0.5859675710455755, + "eval_loss": 0.5772915482521057, + "eval_mean_token_accuracy": 0.8440543389722203, + "eval_num_tokens": 335416.0, + "eval_runtime": 90.755, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 140 + }, + { + "entropy": 0.5869754277169704, + "epoch": 0.38624019312009655, + "grad_norm": 2.9026308059692383, + "learning_rate": 5.19706440126398e-05, + "loss": 2.320369338989258, + "mean_token_accuracy": 0.8441123567521572, + "num_tokens": 380490.0, + "step": 160 + }, + { + "epoch": 0.38624019312009655, + "eval_entropy": 0.5944042242644878, + "eval_loss": 0.5694729089736938, + "eval_mean_token_accuracy": 0.8468695527382111, + "eval_num_tokens": 380490.0, + "eval_runtime": 90.7588, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 160 + }, + { + "entropy": 0.5780520122498274, + "epoch": 0.43452021726010864, + "grad_norm": 3.3172314167022705, + "learning_rate": 5.850783193875801e-05, + "loss": 2.280506134033203, + "mean_token_accuracy": 0.8448525600135326, + "num_tokens": 429118.0, + "step": 180 + }, + { + "epoch": 0.43452021726010864, + "eval_entropy": 0.5612959178645959, + "eval_loss": 0.5575970411300659, + "eval_mean_token_accuracy": 0.8498810844474964, + "eval_num_tokens": 429118.0, + "eval_runtime": 90.7375, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 180 + }, + { + "entropy": 0.5705389507114887, + "epoch": 0.4828002414001207, + "grad_norm": 1.8956339359283447, + "learning_rate": 6.504501986487622e-05, + "loss": 2.242726516723633, + "mean_token_accuracy": 0.848711597174406, + "num_tokens": 478235.0, + "step": 200 + }, + { + "epoch": 0.4828002414001207, + "eval_entropy": 0.5524000726389081, + "eval_loss": 0.5511140823364258, + "eval_mean_token_accuracy": 0.851530607831612, + "eval_num_tokens": 478235.0, + "eval_runtime": 90.7557, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 200 + }, + { + "entropy": 0.5800516583025456, + "epoch": 0.5310802655401328, + "grad_norm": 2.2796475887298584, + "learning_rate": 7.158220779099443e-05, + "loss": 2.2988216400146486, + "mean_token_accuracy": 0.8455737859010697, + "num_tokens": 523478.0, + "step": 220 + }, + { + "epoch": 0.5310802655401328, + "eval_entropy": 0.5344762455546455, + "eval_loss": 0.5491540431976318, + "eval_mean_token_accuracy": 0.8520114234324252, + "eval_num_tokens": 523478.0, + "eval_runtime": 90.7308, + "eval_samples_per_second": 15.651, + "eval_steps_per_second": 1.962, + "step": 220 + }, + { + "entropy": 0.5515169702470303, + "epoch": 0.5793602896801449, + "grad_norm": 1.7194722890853882, + "learning_rate": 7.811939571711266e-05, + "loss": 2.1997905731201173, + "mean_token_accuracy": 0.85145553201437, + "num_tokens": 569874.0, + "step": 240 + }, + { + "epoch": 0.5793602896801449, + "eval_entropy": 0.5982093161411499, + "eval_loss": 0.550338625907898, + "eval_mean_token_accuracy": 0.852124593565973, + "eval_num_tokens": 569874.0, + "eval_runtime": 90.7467, + "eval_samples_per_second": 15.648, + "eval_steps_per_second": 1.962, + "step": 240 + }, + { + "entropy": 0.565448484942317, + "epoch": 0.627640313820157, + "grad_norm": 1.6864795684814453, + "learning_rate": 8.465658364323088e-05, + "loss": 2.228106880187988, + "mean_token_accuracy": 0.85054235085845, + "num_tokens": 614229.0, + "step": 260 + }, + { + "epoch": 0.627640313820157, + "eval_entropy": 0.5699995079737031, + "eval_loss": 0.5463655591011047, + "eval_mean_token_accuracy": 0.852450091852231, + "eval_num_tokens": 614229.0, + "eval_runtime": 90.7728, + "eval_samples_per_second": 15.643, + "eval_steps_per_second": 1.961, + "step": 260 + }, + { + "entropy": 0.5574715089052915, + "epoch": 0.6759203379601689, + "grad_norm": 2.7099924087524414, + "learning_rate": 9.119377156934908e-05, + "loss": 2.173061180114746, + "mean_token_accuracy": 0.852943730354309, + "num_tokens": 664249.0, + "step": 280 + }, + { + "epoch": 0.6759203379601689, + "eval_entropy": 0.5770252673478609, + "eval_loss": 0.5421484708786011, + "eval_mean_token_accuracy": 0.8533824799435862, + "eval_num_tokens": 664249.0, + "eval_runtime": 90.764, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 280 + }, + { + "entropy": 0.5531694382429123, + "epoch": 0.724200362100181, + "grad_norm": 2.56211519241333, + "learning_rate": 9.77309594954673e-05, + "loss": 2.1611295700073243, + "mean_token_accuracy": 0.8546892657876015, + "num_tokens": 711614.0, + "step": 300 + }, + { + "epoch": 0.724200362100181, + "eval_entropy": 0.5576409329189344, + "eval_loss": 0.5419679284095764, + "eval_mean_token_accuracy": 0.8531393000249112, + "eval_num_tokens": 711614.0, + "eval_runtime": 90.7815, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 300 + }, + { + "entropy": 0.5627521466463804, + "epoch": 0.7724803862401931, + "grad_norm": 158.44029235839844, + "learning_rate": 0.0001042681474215855, + "loss": 2.391754913330078, + "mean_token_accuracy": 0.8485012218356133, + "num_tokens": 758911.0, + "step": 320 + }, + { + "epoch": 0.7724803862401931, + "eval_entropy": 0.6003884867335973, + "eval_loss": 0.7040325403213501, + "eval_mean_token_accuracy": 0.8316127952564968, + "eval_num_tokens": 758911.0, + "eval_runtime": 90.7921, + "eval_samples_per_second": 15.64, + "eval_steps_per_second": 1.961, + "step": 320 + }, + { + "entropy": 0.5796094480901957, + "epoch": 0.8207604103802052, + "grad_norm": 7.587340354919434, + "learning_rate": 0.00011080533534770373, + "loss": 2.458403968811035, + "mean_token_accuracy": 0.8445835530757904, + "num_tokens": 809011.0, + "step": 340 + }, + { + "epoch": 0.8207604103802052, + "eval_entropy": 0.5516570319285553, + "eval_loss": 0.5431923270225525, + "eval_mean_token_accuracy": 0.8532732303222913, + "eval_num_tokens": 809011.0, + "eval_runtime": 90.7991, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 340 + }, + { + "entropy": 0.5793778888881207, + "epoch": 0.8690404345202173, + "grad_norm": 2.124638319015503, + "learning_rate": 0.00011734252327382194, + "loss": 2.2603307723999024, + "mean_token_accuracy": 0.8511219322681427, + "num_tokens": 851557.0, + "step": 360 + }, + { + "epoch": 0.8690404345202173, + "eval_entropy": 0.560486475570818, + "eval_loss": 0.5465312600135803, + "eval_mean_token_accuracy": 0.8535054861829522, + "eval_num_tokens": 851557.0, + "eval_runtime": 90.7552, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 360 + }, + { + "entropy": 0.567094936594367, + "epoch": 0.9173204586602294, + "grad_norm": 2.3157570362091064, + "learning_rate": 0.00012387971119994014, + "loss": 2.233865737915039, + "mean_token_accuracy": 0.8490609914064408, + "num_tokens": 894234.0, + "step": 380 + }, + { + "epoch": 0.9173204586602294, + "eval_entropy": 0.5316838782824828, + "eval_loss": 0.5352600812911987, + "eval_mean_token_accuracy": 0.8547654972317513, + "eval_num_tokens": 894234.0, + "eval_runtime": 90.9552, + "eval_samples_per_second": 15.612, + "eval_steps_per_second": 1.957, + "step": 380 + }, + { + "entropy": 0.5548127952963113, + "epoch": 0.9656004828002414, + "grad_norm": 3.601078748703003, + "learning_rate": 0.00013041689912605836, + "loss": 2.2153223037719725, + "mean_token_accuracy": 0.8552668362855911, + "num_tokens": 939370.0, + "step": 400 + }, + { + "epoch": 0.9656004828002414, + "eval_entropy": 0.5799920406569256, + "eval_loss": 0.5496681928634644, + "eval_mean_token_accuracy": 0.853103037630574, + "eval_num_tokens": 939370.0, + "eval_runtime": 90.7969, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 400 + }, + { + "entropy": 0.5529916116169521, + "epoch": 1.012070006035003, + "grad_norm": 2.1900832653045654, + "learning_rate": 0.00013564626559428973, + "loss": 2.0600866317749023, + "mean_token_accuracy": 0.856480234629148, + "num_tokens": 985127.0, + "step": 420 + }, + { + "epoch": 1.012070006035003, + "eval_entropy": 0.5062234095308218, + "eval_loss": 0.5424100756645203, + "eval_mean_token_accuracy": 0.8541433596878909, + "eval_num_tokens": 985127.0, + "eval_runtime": 90.8162, + "eval_samples_per_second": 15.636, + "eval_steps_per_second": 1.96, + "step": 420 + }, + { + "entropy": 0.4908415086567402, + "epoch": 1.060350030175015, + "grad_norm": 2.2977170944213867, + "learning_rate": 0.00013563283050733522, + "loss": 1.9583213806152344, + "mean_token_accuracy": 0.8643453657627106, + "num_tokens": 1035652.0, + "step": 440 + }, + { + "epoch": 1.060350030175015, + "eval_entropy": 0.5066900360450316, + "eval_loss": 0.5420679450035095, + "eval_mean_token_accuracy": 0.8551041915845335, + "eval_num_tokens": 1035652.0, + "eval_runtime": 90.8096, + "eval_samples_per_second": 15.637, + "eval_steps_per_second": 1.96, + "step": 440 + }, + { + "entropy": 0.50622633472085, + "epoch": 1.1086300543150271, + "grad_norm": 2.5061802864074707, + "learning_rate": 0.00013560020613235583, + "loss": 1.9980314254760743, + "mean_token_accuracy": 0.8637742318212986, + "num_tokens": 1082852.0, + "step": 460 + }, + { + "epoch": 1.1086300543150271, + "eval_entropy": 0.5188879335529348, + "eval_loss": 0.5445871949195862, + "eval_mean_token_accuracy": 0.8549745655461644, + "eval_num_tokens": 1082852.0, + "eval_runtime": 90.9655, + "eval_samples_per_second": 15.61, + "eval_steps_per_second": 1.957, + "step": 460 + }, + { + "entropy": 0.5019329734146595, + "epoch": 1.1569100784550392, + "grad_norm": 2.253516912460327, + "learning_rate": 0.0001355484017016638, + "loss": 1.9593570709228516, + "mean_token_accuracy": 0.8636295884847641, + "num_tokens": 1131836.0, + "step": 480 + }, + { + "epoch": 1.1569100784550392, + "eval_entropy": 0.4907115553871969, + "eval_loss": 0.5450211763381958, + "eval_mean_token_accuracy": 0.8554045839256115, + "eval_num_tokens": 1131836.0, + "eval_runtime": 91.0455, + "eval_samples_per_second": 15.597, + "eval_steps_per_second": 1.955, + "step": 480 + }, + { + "entropy": 0.5109445530921221, + "epoch": 1.2051901025950513, + "grad_norm": 10.47754192352295, + "learning_rate": 0.00013547743187530023, + "loss": 2.0416118621826174, + "mean_token_accuracy": 0.8610585704445839, + "num_tokens": 1176544.0, + "step": 500 + }, + { + "epoch": 1.2051901025950513, + "eval_entropy": 0.5329894945862588, + "eval_loss": 0.5426890254020691, + "eval_mean_token_accuracy": 0.8550159998154372, + "eval_num_tokens": 1176544.0, + "eval_runtime": 90.7977, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 500 + }, + { + "entropy": 0.529351257160306, + "epoch": 1.2534701267350634, + "grad_norm": 2.3251631259918213, + "learning_rate": 0.00013538731673688647, + "loss": 2.035448455810547, + "mean_token_accuracy": 0.8615639433264732, + "num_tokens": 1224767.0, + "step": 520 + }, + { + "epoch": 1.2534701267350634, + "eval_entropy": 0.5154926207628143, + "eval_loss": 0.5380744338035583, + "eval_mean_token_accuracy": 0.8570477728093608, + "eval_num_tokens": 1224767.0, + "eval_runtime": 90.9006, + "eval_samples_per_second": 15.621, + "eval_steps_per_second": 1.958, + "step": 520 + }, + { + "entropy": 0.5304025936871767, + "epoch": 1.3017501508750755, + "grad_norm": 2.1253819465637207, + "learning_rate": 0.00013527808178794075, + "loss": 1.9914405822753907, + "mean_token_accuracy": 0.8642974093556404, + "num_tokens": 1272629.0, + "step": 540 + }, + { + "epoch": 1.3017501508750755, + "eval_entropy": 0.5014389195803846, + "eval_loss": 0.5321570038795471, + "eval_mean_token_accuracy": 0.8578029737043916, + "eval_num_tokens": 1272629.0, + "eval_runtime": 90.8317, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 540 + }, + { + "entropy": 0.5210890706628561, + "epoch": 1.3500301750150876, + "grad_norm": 2.370936393737793, + "learning_rate": 0.00013514975794066148, + "loss": 1.9768535614013671, + "mean_token_accuracy": 0.8633426748216152, + "num_tokens": 1318908.0, + "step": 560 + }, + { + "epoch": 1.3500301750150876, + "eval_entropy": 0.527289214428891, + "eval_loss": 0.5302034020423889, + "eval_mean_token_accuracy": 0.8576852588841085, + "eval_num_tokens": 1318908.0, + "eval_runtime": 90.9133, + "eval_samples_per_second": 15.619, + "eval_steps_per_second": 1.958, + "step": 560 + }, + { + "entropy": 0.5380321107804775, + "epoch": 1.3983101991550995, + "grad_norm": 2.9873898029327393, + "learning_rate": 0.00013500238150917956, + "loss": 2.024580192565918, + "mean_token_accuracy": 0.8618835039436817, + "num_tokens": 1360949.0, + "step": 580 + }, + { + "epoch": 1.3983101991550995, + "eval_entropy": 0.5204530746749277, + "eval_loss": 0.5321171879768372, + "eval_mean_token_accuracy": 0.8571079852205984, + "eval_num_tokens": 1360949.0, + "eval_runtime": 90.8323, + "eval_samples_per_second": 15.633, + "eval_steps_per_second": 1.96, + "step": 580 + }, + { + "entropy": 0.5245487812906504, + "epoch": 1.4465902232951118, + "grad_norm": 1.9314465522766113, + "learning_rate": 0.00013483599419928177, + "loss": 2.007284164428711, + "mean_token_accuracy": 0.8627093754708767, + "num_tokens": 1407135.0, + "step": 600 + }, + { + "epoch": 1.4465902232951118, + "eval_entropy": 0.536725418453806, + "eval_loss": 0.5315413475036621, + "eval_mean_token_accuracy": 0.8581455457076598, + "eval_num_tokens": 1407135.0, + "eval_runtime": 90.7502, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 600 + }, + { + "entropy": 0.5325499556958675, + "epoch": 1.4948702474351236, + "grad_norm": 2.1466152667999268, + "learning_rate": 0.00013465064309660862, + "loss": 2.0100082397460937, + "mean_token_accuracy": 0.8619302660226822, + "num_tokens": 1454219.0, + "step": 620 + }, + { + "epoch": 1.4948702474351236, + "eval_entropy": 0.5285820202546173, + "eval_loss": 0.5281327366828918, + "eval_mean_token_accuracy": 0.8574312443143866, + "eval_num_tokens": 1454219.0, + "eval_runtime": 90.7975, + "eval_samples_per_second": 15.639, + "eval_steps_per_second": 1.96, + "step": 620 + }, + { + "entropy": 0.5270347118377685, + "epoch": 1.5431502715751357, + "grad_norm": 1.972477912902832, + "learning_rate": 0.00013444638065332972, + "loss": 2.0097970962524414, + "mean_token_accuracy": 0.8616458527743817, + "num_tokens": 1500879.0, + "step": 640 + }, + { + "epoch": 1.5431502715751357, + "eval_entropy": 0.5531984363379103, + "eval_loss": 0.525027871131897, + "eval_mean_token_accuracy": 0.8590488440535041, + "eval_num_tokens": 1500879.0, + "eval_runtime": 90.8289, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 640 + }, + { + "entropy": 0.5264006167650223, + "epoch": 1.5914302957151478, + "grad_norm": 2.101114273071289, + "learning_rate": 0.00013422326467330028, + "loss": 2.003971481323242, + "mean_token_accuracy": 0.8630450166761875, + "num_tokens": 1547565.0, + "step": 660 + }, + { + "epoch": 1.5914302957151478, + "eval_entropy": 0.4910608320758584, + "eval_loss": 0.5248087644577026, + "eval_mean_token_accuracy": 0.8599436738517847, + "eval_num_tokens": 1547565.0, + "eval_runtime": 91.0328, + "eval_samples_per_second": 15.599, + "eval_steps_per_second": 1.955, + "step": 660 + }, + { + "entropy": 0.5071224015206098, + "epoch": 1.63971031985516, + "grad_norm": 2.1309502124786377, + "learning_rate": 0.00013398135829570344, + "loss": 1.9901405334472657, + "mean_token_accuracy": 0.8636759266257286, + "num_tokens": 1593600.0, + "step": 680 + }, + { + "epoch": 1.63971031985516, + "eval_entropy": 0.5047111117772842, + "eval_loss": 0.5270171165466309, + "eval_mean_token_accuracy": 0.8586233539527721, + "eval_num_tokens": 1593600.0, + "eval_runtime": 90.8264, + "eval_samples_per_second": 15.634, + "eval_steps_per_second": 1.96, + "step": 680 + }, + { + "entropy": 0.517396530508995, + "epoch": 1.687990343995172, + "grad_norm": 2.6385438442230225, + "learning_rate": 0.00013372072997718266, + "loss": 2.0036354064941406, + "mean_token_accuracy": 0.8638267777860165, + "num_tokens": 1642224.0, + "step": 700 + }, + { + "epoch": 1.687990343995172, + "eval_entropy": 0.49953744317708393, + "eval_loss": 0.5215877890586853, + "eval_mean_token_accuracy": 0.859384286269713, + "eval_num_tokens": 1642224.0, + "eval_runtime": 90.8569, + "eval_samples_per_second": 15.629, + "eval_steps_per_second": 1.959, + "step": 700 + }, + { + "entropy": 0.5117329221218825, + "epoch": 1.736270368135184, + "grad_norm": 1.6593103408813477, + "learning_rate": 0.00013344145347246906, + "loss": 2.003920555114746, + "mean_token_accuracy": 0.8636307917535305, + "num_tokens": 1693392.0, + "step": 720 + }, + { + "epoch": 1.736270368135184, + "eval_entropy": 0.5288207604644004, + "eval_loss": 0.5156714916229248, + "eval_mean_token_accuracy": 0.8617460369394067, + "eval_num_tokens": 1693392.0, + "eval_runtime": 90.7698, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 720 + }, + { + "entropy": 0.5143411785364151, + "epoch": 1.7845503922751962, + "grad_norm": 2.080177068710327, + "learning_rate": 0.00013314360781350998, + "loss": 1.994948959350586, + "mean_token_accuracy": 0.8643602155148983, + "num_tokens": 1742358.0, + "step": 740 + }, + { + "epoch": 1.7845503922751962, + "eval_entropy": 0.5050565709223908, + "eval_loss": 0.5188468098640442, + "eval_mean_token_accuracy": 0.8601690252845207, + "eval_num_tokens": 1742358.0, + "eval_runtime": 90.7641, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 740 + }, + { + "entropy": 0.5174011919647455, + "epoch": 1.832830416415208, + "grad_norm": 3.259908676147461, + "learning_rate": 0.00013282727728710375, + "loss": 1.9772701263427734, + "mean_token_accuracy": 0.8646314896643161, + "num_tokens": 1786930.0, + "step": 760 + }, + { + "epoch": 1.832830416415208, + "eval_entropy": 0.4937750380695536, + "eval_loss": 0.5224619507789612, + "eval_mean_token_accuracy": 0.8592762418007582, + "eval_num_tokens": 1786930.0, + "eval_runtime": 90.7224, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 760 + }, + { + "entropy": 0.5243690617382526, + "epoch": 1.8811104405552204, + "grad_norm": 2.209547519683838, + "learning_rate": 0.00013249255141104747, + "loss": 2.0030281066894533, + "mean_token_accuracy": 0.8628844127058983, + "num_tokens": 1833956.0, + "step": 780 + }, + { + "epoch": 1.8811104405552204, + "eval_entropy": 0.5570755493774843, + "eval_loss": 0.5178046226501465, + "eval_mean_token_accuracy": 0.8601498302449001, + "eval_num_tokens": 1833956.0, + "eval_runtime": 90.7399, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 780 + }, + { + "entropy": 0.5075355738401413, + "epoch": 1.9293904646952322, + "grad_norm": 1.8813495635986328, + "learning_rate": 0.00013213952490880468, + "loss": 1.9060043334960937, + "mean_token_accuracy": 0.8672933347523213, + "num_tokens": 1881345.0, + "step": 800 + }, + { + "epoch": 1.9293904646952322, + "eval_entropy": 0.5167921193864908, + "eval_loss": 0.5141814947128296, + "eval_mean_token_accuracy": 0.8620959691117319, + "eval_num_tokens": 1881345.0, + "eval_runtime": 90.7632, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 800 + }, + { + "entropy": 0.5104774657636881, + "epoch": 1.9776704888352445, + "grad_norm": 2.2347588539123535, + "learning_rate": 0.0001317682976826996, + "loss": 1.9154193878173829, + "mean_token_accuracy": 0.8677295126020909, + "num_tokens": 1926308.0, + "step": 820 + }, + { + "epoch": 1.9776704888352445, + "eval_entropy": 0.4975446199768045, + "eval_loss": 0.5171827077865601, + "eval_mean_token_accuracy": 0.8614644890420893, + "eval_num_tokens": 1926308.0, + "eval_runtime": 90.7332, + "eval_samples_per_second": 15.65, + "eval_steps_per_second": 1.962, + "step": 820 + }, + { + "entropy": 0.4617252717544506, + "epoch": 2.024140012070006, + "grad_norm": 2.3023998737335205, + "learning_rate": 0.00013137897478564603, + "loss": 1.672403907775879, + "mean_token_accuracy": 0.877363781650345, + "num_tokens": 1972496.0, + "step": 840 + }, + { + "epoch": 2.024140012070006, + "eval_entropy": 0.4930287114020144, + "eval_loss": 0.5240046977996826, + "eval_mean_token_accuracy": 0.8597234454047814, + "eval_num_tokens": 1972496.0, + "eval_runtime": 90.7242, + "eval_samples_per_second": 15.652, + "eval_steps_per_second": 1.962, + "step": 840 + }, + { + "entropy": 0.41189998425543306, + "epoch": 2.0724200362100182, + "grad_norm": 2.5921578407287598, + "learning_rate": 0.00013097166639141857, + "loss": 1.5435317993164062, + "mean_token_accuracy": 0.8864825963973999, + "num_tokens": 2020733.0, + "step": 860 + }, + { + "epoch": 2.0724200362100182, + "eval_entropy": 0.46020560820450945, + "eval_loss": 0.5281100869178772, + "eval_mean_token_accuracy": 0.8605042665861966, + "eval_num_tokens": 2020733.0, + "eval_runtime": 90.7546, + "eval_samples_per_second": 15.647, + "eval_steps_per_second": 1.961, + "step": 860 + }, + { + "entropy": 0.4227153487503529, + "epoch": 2.12070006035003, + "grad_norm": 2.2209794521331787, + "learning_rate": 0.0001305464877634748, + "loss": 1.571579933166504, + "mean_token_accuracy": 0.8854078397154808, + "num_tokens": 2066856.0, + "step": 880 + }, + { + "epoch": 2.12070006035003, + "eval_entropy": 0.4408075308866715, + "eval_loss": 0.534494161605835, + "eval_mean_token_accuracy": 0.8604544247134348, + "eval_num_tokens": 2066856.0, + "eval_runtime": 90.8502, + "eval_samples_per_second": 15.63, + "eval_steps_per_second": 1.959, + "step": 880 + }, + { + "entropy": 0.40760905370116235, + "epoch": 2.1689800844900424, + "grad_norm": 2.571462631225586, + "learning_rate": 0.00013010355922233707, + "loss": 1.5575182914733887, + "mean_token_accuracy": 0.8846474155783653, + "num_tokens": 2117470.0, + "step": 900 + }, + { + "epoch": 2.1689800844900424, + "eval_entropy": 0.4561347976494371, + "eval_loss": 0.5359405875205994, + "eval_mean_token_accuracy": 0.8610902686467331, + "eval_num_tokens": 2117470.0, + "eval_runtime": 90.7395, + "eval_samples_per_second": 15.649, + "eval_steps_per_second": 1.962, + "step": 900 + }, + { + "entropy": 0.4111258488148451, + "epoch": 2.2172601086300543, + "grad_norm": 1.8378095626831055, + "learning_rate": 0.00012964300611154316, + "loss": 1.538413143157959, + "mean_token_accuracy": 0.8867764480412006, + "num_tokens": 2169713.0, + "step": 920 + }, + { + "epoch": 2.2172601086300543, + "eval_entropy": 0.43873994337039046, + "eval_loss": 0.5272142887115479, + "eval_mean_token_accuracy": 0.8617157025283642, + "eval_num_tokens": 2169713.0, + "eval_runtime": 90.7602, + "eval_samples_per_second": 15.646, + "eval_steps_per_second": 1.961, + "step": 920 + }, + { + "entropy": 0.4261228807270527, + "epoch": 2.2655401327700666, + "grad_norm": 2.3900320529937744, + "learning_rate": 0.0001291649587621756, + "loss": 1.58123836517334, + "mean_token_accuracy": 0.8852489396929741, + "num_tokens": 2211210.0, + "step": 940 + }, + { + "epoch": 2.2655401327700666, + "eval_entropy": 0.40514066028461027, + "eval_loss": 0.5390793681144714, + "eval_mean_token_accuracy": 0.8620827519491817, + "eval_num_tokens": 2211210.0, + "eval_runtime": 90.761, + "eval_samples_per_second": 15.645, + "eval_steps_per_second": 1.961, + "step": 940 + }, + { + "entropy": 0.4236912790685892, + "epoch": 2.3138201569100785, + "grad_norm": 1.635823130607605, + "learning_rate": 0.00012866955245597952, + "loss": 1.5851353645324706, + "mean_token_accuracy": 0.8841134652495384, + "num_tokens": 2256672.0, + "step": 960 + }, + { + "epoch": 2.3138201569100785, + "eval_entropy": 0.46641212182768277, + "eval_loss": 0.5175439119338989, + "eval_mean_token_accuracy": 0.8625401358925895, + "eval_num_tokens": 2256672.0, + "eval_runtime": 90.7723, + "eval_samples_per_second": 15.644, + "eval_steps_per_second": 1.961, + "step": 960 + }, + { + "entropy": 0.4201499901711941, + "epoch": 2.3621001810500903, + "grad_norm": 2.2045373916625977, + "learning_rate": 0.0001281569273870795, + "loss": 1.603045654296875, + "mean_token_accuracy": 0.8843393631279468, + "num_tokens": 2302305.0, + "step": 980 + }, + { + "epoch": 2.3621001810500903, + "eval_entropy": 0.45320005028435356, + "eval_loss": 0.5284702181816101, + "eval_mean_token_accuracy": 0.8614657628402281, + "eval_num_tokens": 2302305.0, + "eval_runtime": 90.784, + "eval_samples_per_second": 15.642, + "eval_steps_per_second": 1.961, + "step": 980 + } + ], + "logging_steps": 20, + "max_steps": 4150, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.214631996501044e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}