| evaluations: |
| arc_challenge_poly_pt_acc: 0.564957264957265 |
| arc_challenge_poly_pt_acc_norm: 0.6034188034188034 |
| arc_challenge_poly_pt_acc_norm_stderr: 0.014307647225117459 |
| arc_challenge_poly_pt_acc_stderr: 0.014499949963905044 |
| arc_challenge_poly_pt_alias: arc_challenge_poly_pt |
| assin2_rte_acc,all: 0.9268790849673203 |
| assin2_rte_acc_stderr,all: 0.0037334279295552935 |
| assin2_rte_alias: assin2_rte |
| assin2_rte_f1_macro,all: 0.9275512463143912 |
| assin2_rte_f1_macro_stderr,all: 0.00371420765851098 |
| assin2_sts_alias: assin2_sts |
| assin2_sts_mse,all: 1.0666135620915032 |
| assin2_sts_mse_stderr,all: N/A |
| assin2_sts_pearson,all: 0.7025111393506918 |
| assin2_sts_pearson_stderr,all: 0.007270347714019143 |
| assin_entailment_acc: 0.729 |
| assin_entailment_acc_stderr: 0.00702866278356924 |
| assin_entailment_alias: assin_entailment |
| assin_paraphrase_acc: 0.70175 |
| assin_paraphrase_acc_stderr: 0.007234453587153363 |
| assin_paraphrase_alias: assin_paraphrase |
| belebele_por_Latn_acc: 0.8522222222222222 |
| belebele_por_Latn_acc_norm: 0.8522222222222222 |
| belebele_por_Latn_acc_norm_stderr: 0.011835896183094718 |
| belebele_por_Latn_acc_stderr: 0.011835896183094718 |
| belebele_por_Latn_alias: belebele_por_Latn |
| bluex_acc,all: 0.6453407510431154 |
| bluex_acc,exam_id__UNICAMP_2018: 0.5740740740740741 |
| bluex_acc,exam_id__UNICAMP_2019: 0.66 |
| bluex_acc,exam_id__UNICAMP_2020: 0.6545454545454545 |
| bluex_acc,exam_id__UNICAMP_2021_1: 0.5217391304347826 |
| bluex_acc,exam_id__UNICAMP_2021_2: 0.5686274509803921 |
| bluex_acc,exam_id__UNICAMP_2022: 0.7692307692307693 |
| bluex_acc,exam_id__UNICAMP_2023: 0.6976744186046512 |
| bluex_acc,exam_id__UNICAMP_2024: 0.6444444444444445 |
| bluex_acc,exam_id__USP_2018: 0.5555555555555556 |
| bluex_acc,exam_id__USP_2019: 0.6 |
| bluex_acc,exam_id__USP_2020: 0.6428571428571429 |
| bluex_acc,exam_id__USP_2021: 0.75 |
| bluex_acc,exam_id__USP_2022: 0.6122448979591837 |
| bluex_acc,exam_id__USP_2023: 0.7272727272727273 |
| bluex_acc,exam_id__USP_2024: 0.7560975609756098 |
| bluex_acc_stderr,all: 0.01030518267625725 |
| bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03890130999517633 |
| bluex_acc_stderr,exam_id__UNICAMP_2019: 0.038803737151553275 |
| bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03685887038229413 |
| bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.04271355498682148 |
| bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.0399126533750554 |
| bluex_acc_stderr,exam_id__UNICAMP_2022: 0.03889086167730691 |
| bluex_acc_stderr,exam_id__UNICAMP_2023: 0.040270225038111215 |
| bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04120765210662655 |
| bluex_acc_stderr,exam_id__USP_2018: 0.039103130248058574 |
| bluex_acc_stderr,exam_id__USP_2019: 0.04468770985550053 |
| bluex_acc_stderr,exam_id__USP_2020: 0.036928680498947423 |
| bluex_acc_stderr,exam_id__USP_2021: 0.03469119064952554 |
| bluex_acc_stderr,exam_id__USP_2022: 0.0401889704113241 |
| bluex_acc_stderr,exam_id__USP_2023: 0.03875202753310343 |
| bluex_acc_stderr,exam_id__USP_2024: 0.0386460634988709 |
| bluex_alias: bluex |
| calame_pt_acc: 0.49710982658959535 |
| calame_pt_acc_stderr: 0.010976242623017822 |
| calame_pt_alias: calame_pt |
| calame_pt_perplexity: 9.89153235647868 |
| calame_pt_perplexity_stderr: 0.5360166925656309 |
| enem_challenge_acc,all: 0.7291812456263121 |
| enem_challenge_acc,exam_id__2009: 0.6695652173913044 |
| enem_challenge_acc,exam_id__2010: 0.7948717948717948 |
| enem_challenge_acc,exam_id__2011: 0.8034188034188035 |
| enem_challenge_acc,exam_id__2012: 0.7672413793103449 |
| enem_challenge_acc,exam_id__2013: 0.7685185185185185 |
| enem_challenge_acc,exam_id__2014: 0.7339449541284404 |
| enem_challenge_acc,exam_id__2015: 0.7142857142857143 |
| enem_challenge_acc,exam_id__2016: 0.7024793388429752 |
| enem_challenge_acc,exam_id__2016_2: 0.6910569105691057 |
| enem_challenge_acc,exam_id__2017: 0.6810344827586207 |
| enem_challenge_acc,exam_id__2022: 0.6842105263157895 |
| enem_challenge_acc,exam_id__2023: 0.7481481481481481 |
| enem_challenge_acc_stderr,all: 0.006807641973418005 |
| enem_challenge_acc_stderr,exam_id__2009: 0.02528804994180274 |
| enem_challenge_acc_stderr,exam_id__2010: 0.02149640668210639 |
| enem_challenge_acc_stderr,exam_id__2011: 0.021210692080503628 |
| enem_challenge_acc_stderr,exam_id__2012: 0.022660217278927836 |
| enem_challenge_acc_stderr,exam_id__2013: 0.02336336231167537 |
| enem_challenge_acc_stderr,exam_id__2014: 0.024490632130058823 |
| enem_challenge_acc_stderr,exam_id__2015: 0.023816151603437377 |
| enem_challenge_acc_stderr,exam_id__2016: 0.023907462731025793 |
| enem_challenge_acc_stderr,exam_id__2016_2: 0.024064368893731206 |
| enem_challenge_acc_stderr,exam_id__2017: 0.024896265014785066 |
| enem_challenge_acc_stderr,exam_id__2022: 0.023240775852571182 |
| enem_challenge_acc_stderr,exam_id__2023: 0.021584384605568642 |
| enem_challenge_alias: enem |
| faquad_nli_acc,all: 0.8584615384615385 |
| faquad_nli_acc_stderr,all: 0.00964563979812997 |
| faquad_nli_alias: faquad_nli |
| faquad_nli_f1_macro,all: 0.7500687682235792 |
| faquad_nli_f1_macro_stderr,all: 0.016061769703906006 |
| global_piqa_completions_por_latn_braz_acc: 0.84 |
| global_piqa_completions_por_latn_braz_acc_bytes: 0.78 |
| global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.041633319989322654 |
| global_piqa_completions_por_latn_braz_acc_norm: 0.8 |
| global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.04020151261036849 |
| global_piqa_completions_por_latn_braz_acc_stderr: 0.03684529491774706 |
| global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz |
| gsm8k_pt_alias: gsm8k_pt |
| gsm8k_pt_exact_match,flexible-extract: 0.5380517503805176 |
| gsm8k_pt_exact_match,strict-match: 0.0 |
| gsm8k_pt_exact_match_stderr,flexible-extract: 0.013758665878004294 |
| gsm8k_pt_exact_match_stderr,strict-match: 0.0 |
| hatebr_offensive_acc,all: 0.8035714285714286 |
| hatebr_offensive_acc_stderr,all: 0.007485688659514488 |
| hatebr_offensive_alias: hatebr_offensive_binary |
| hatebr_offensive_f1_macro,all: 0.7971547930214926 |
| hatebr_offensive_f1_macro_stderr,all: 0.0077008767186672504 |
| hellaswag_poly_pt_acc: 0.4855347274894355 |
| hellaswag_poly_pt_acc_norm: 0.6400476758045291 |
| hellaswag_poly_pt_acc_norm_stderr: 0.0049966031862365996 |
| hellaswag_poly_pt_acc_stderr: 0.00520276713464127 |
| hellaswag_poly_pt_alias: hellaswag_poly_pt |
| humaneval_instruct_alias: humaneval_instruct |
| humaneval_instruct_pass@1,create_test: 0.47560975609756095 |
| humaneval_instruct_pass@1_stderr,create_test: 0.039116399837036665 |
| ifeval_pt_alias: ifeval_pt |
| ifeval_pt_inst_level_loose_acc: 0.5209302325581395 |
| ifeval_pt_inst_level_loose_acc_stderr: N/A |
| ifeval_pt_inst_level_strict_acc: 0.48604651162790696 |
| ifeval_pt_inst_level_strict_acc_stderr: N/A |
| ifeval_pt_prompt_level_loose_acc: 0.4166666666666667 |
| ifeval_pt_prompt_level_loose_acc_stderr: 0.028511310643917525 |
| ifeval_pt_prompt_level_strict_acc: 0.38666666666666666 |
| ifeval_pt_prompt_level_strict_acc_stderr: 0.028163138908196883 |
| lambada_poly_pt_acc: 0.5893654182029886 |
| lambada_poly_pt_acc_stderr: 0.006853811533501879 |
| lambada_poly_pt_alias: lambada_poly_pt |
| lambada_poly_pt_perplexity: 10.134180916859764 |
| lambada_poly_pt_perplexity_stderr: 0.3843105646542514 |
| mmlu_poly_pt_acc: 0.6463524467126989 |
| mmlu_poly_pt_acc_stderr: 0.004142085063565535 |
| mmlu_poly_pt_alias: mmlu_poly_pt |
| oab_exams_acc,all: 0.5430523917995445 |
| oab_exams_acc,exam_id__2010-01: 0.38823529411764707 |
| oab_exams_acc,exam_id__2010-02: 0.52 |
| oab_exams_acc,exam_id__2011-03: 0.48484848484848486 |
| oab_exams_acc,exam_id__2011-04: 0.5 |
| oab_exams_acc,exam_id__2011-05: 0.6125 |
| oab_exams_acc,exam_id__2012-06: 0.55 |
| oab_exams_acc,exam_id__2012-06a: 0.6625 |
| oab_exams_acc,exam_id__2012-07: 0.55 |
| oab_exams_acc,exam_id__2012-08: 0.525 |
| oab_exams_acc,exam_id__2012-09: 0.4675324675324675 |
| oab_exams_acc,exam_id__2013-10: 0.525 |
| oab_exams_acc,exam_id__2013-11: 0.525 |
| oab_exams_acc,exam_id__2013-12: 0.5625 |
| oab_exams_acc,exam_id__2014-13: 0.5375 |
| oab_exams_acc,exam_id__2014-14: 0.525 |
| oab_exams_acc,exam_id__2014-15: 0.6282051282051282 |
| oab_exams_acc,exam_id__2015-16: 0.5 |
| oab_exams_acc,exam_id__2015-17: 0.6282051282051282 |
| oab_exams_acc,exam_id__2015-18: 0.5625 |
| oab_exams_acc,exam_id__2016-19: 0.6153846153846154 |
| oab_exams_acc,exam_id__2016-20: 0.6625 |
| oab_exams_acc,exam_id__2016-20a: 0.5125 |
| oab_exams_acc,exam_id__2016-21: 0.55 |
| oab_exams_acc,exam_id__2017-22: 0.5375 |
| oab_exams_acc,exam_id__2017-23: 0.5625 |
| oab_exams_acc,exam_id__2017-24: 0.5625 |
| oab_exams_acc,exam_id__2018-25: 0.4375 |
| oab_exams_acc_stderr,all: 0.006135349097679556 |
| oab_exams_acc_stderr,exam_id__2010-01: 0.03058058457150523 |
| oab_exams_acc_stderr,exam_id__2010-02: 0.02878728598780894 |
| oab_exams_acc_stderr,exam_id__2011-03: 0.028914895804693044 |
| oab_exams_acc_stderr,exam_id__2011-04: 0.03228642008607808 |
| oab_exams_acc_stderr,exam_id__2011-05: 0.031320378814850496 |
| oab_exams_acc_stderr,exam_id__2012-06: 0.0321418972062254 |
| oab_exams_acc_stderr,exam_id__2012-06a: 0.030467398275719 |
| oab_exams_acc_stderr,exam_id__2012-07: 0.03200736259141147 |
| oab_exams_acc_stderr,exam_id__2012-08: 0.032266439816730046 |
| oab_exams_acc_stderr,exam_id__2012-09: 0.03267111052784679 |
| oab_exams_acc_stderr,exam_id__2013-10: 0.03227871954884383 |
| oab_exams_acc_stderr,exam_id__2013-11: 0.032317288738860646 |
| oab_exams_acc_stderr,exam_id__2013-12: 0.03201360716056409 |
| oab_exams_acc_stderr,exam_id__2014-13: 0.03212888100414898 |
| oab_exams_acc_stderr,exam_id__2014-14: 0.03224480657768177 |
| oab_exams_acc_stderr,exam_id__2014-15: 0.03161561320473922 |
| oab_exams_acc_stderr,exam_id__2015-16: 0.032285769779197525 |
| oab_exams_acc_stderr,exam_id__2015-17: 0.03149277459357065 |
| oab_exams_acc_stderr,exam_id__2015-18: 0.03193661718329485 |
| oab_exams_acc_stderr,exam_id__2016-19: 0.031871219023965544 |
| oab_exams_acc_stderr,exam_id__2016-20: 0.030531876914015273 |
| oab_exams_acc_stderr,exam_id__2016-20a: 0.03228630691735584 |
| oab_exams_acc_stderr,exam_id__2016-21: 0.03207739680674974 |
| oab_exams_acc_stderr,exam_id__2017-22: 0.03209350013865321 |
| oab_exams_acc_stderr,exam_id__2017-23: 0.03217100216372156 |
| oab_exams_acc_stderr,exam_id__2017-24: 0.032070126882455134 |
| oab_exams_acc_stderr,exam_id__2018-25: 0.03198404003914565 |
| oab_exams_alias: oab_exams |
| portuguese_hate_speech_acc,all: 0.6698002350176263 |
| portuguese_hate_speech_acc_stderr,all: 0.011429241144055021 |
| portuguese_hate_speech_alias: portuguese_hate_speech_binary |
| portuguese_hate_speech_f1_macro,all: 0.6614050364958336 |
| portuguese_hate_speech_f1_macro_stderr,all: 0.01163030492382994 |
| tweetsentbr_acc,all: 0.7069651741293532 |
| tweetsentbr_acc_stderr,all: 0.007157154484263642 |
| tweetsentbr_alias: tweetsentbr |
| tweetsentbr_f1_macro,all: 0.6707907518730517 |
| tweetsentbr_f1_macro_stderr,all: 0.007569663916929406 |
| step: 69750 |
|
|