Text Generation
Transformers
Safetensors
Portuguese
qwen3
text-generation-inference
conversational
Eval Results (legacy)
nicholasKluge's picture
Upload evals.yaml with huggingface_hub
a08e029 verified
evaluations:
arc_challenge_poly_pt_acc: 0.564957264957265
arc_challenge_poly_pt_acc_norm: 0.6034188034188034
arc_challenge_poly_pt_acc_norm_stderr: 0.014307647225117459
arc_challenge_poly_pt_acc_stderr: 0.014499949963905044
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.9268790849673203
assin2_rte_acc_stderr,all: 0.0037334279295552935
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.9275512463143912
assin2_rte_f1_macro_stderr,all: 0.00371420765851098
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 1.0666135620915032
assin2_sts_mse_stderr,all: "N/A"
assin2_sts_pearson,all: 0.7025111393506918
assin2_sts_pearson_stderr,all: 0.007270347714019143
assin_entailment_acc: 0.729
assin_entailment_acc_stderr: 0.00702866278356924
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.70175
assin_paraphrase_acc_stderr: 0.007234453587153363
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.8522222222222222
belebele_por_Latn_acc_norm: 0.8522222222222222
belebele_por_Latn_acc_norm_stderr: 0.011835896183094718
belebele_por_Latn_acc_stderr: 0.011835896183094718
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.6453407510431154
bluex_acc,exam_id__UNICAMP_2018: 0.5740740740740741
bluex_acc,exam_id__UNICAMP_2019: 0.66
bluex_acc,exam_id__UNICAMP_2020: 0.6545454545454545
bluex_acc,exam_id__UNICAMP_2021_1: 0.5217391304347826
bluex_acc,exam_id__UNICAMP_2021_2: 0.5686274509803921
bluex_acc,exam_id__UNICAMP_2022: 0.7692307692307693
bluex_acc,exam_id__UNICAMP_2023: 0.6976744186046512
bluex_acc,exam_id__UNICAMP_2024: 0.6444444444444445
bluex_acc,exam_id__USP_2018: 0.5555555555555556
bluex_acc,exam_id__USP_2019: 0.6
bluex_acc,exam_id__USP_2020: 0.6428571428571429
bluex_acc,exam_id__USP_2021: 0.75
bluex_acc,exam_id__USP_2022: 0.6122448979591837
bluex_acc,exam_id__USP_2023: 0.7272727272727273
bluex_acc,exam_id__USP_2024: 0.7560975609756098
bluex_acc_stderr,all: 0.01030518267625725
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.03890130999517633
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.038803737151553275
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03685887038229413
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.04271355498682148
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.0399126533750554
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.03889086167730691
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.040270225038111215
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.04120765210662655
bluex_acc_stderr,exam_id__USP_2018: 0.039103130248058574
bluex_acc_stderr,exam_id__USP_2019: 0.04468770985550053
bluex_acc_stderr,exam_id__USP_2020: 0.036928680498947423
bluex_acc_stderr,exam_id__USP_2021: 0.03469119064952554
bluex_acc_stderr,exam_id__USP_2022: 0.0401889704113241
bluex_acc_stderr,exam_id__USP_2023: 0.03875202753310343
bluex_acc_stderr,exam_id__USP_2024: 0.0386460634988709
bluex_alias: bluex
calame_pt_acc: 0.49710982658959535
calame_pt_acc_stderr: 0.010976242623017822
calame_pt_alias: calame_pt
calame_pt_perplexity: 9.89153235647868
calame_pt_perplexity_stderr: 0.5360166925656309
enem_challenge_acc,all: 0.7291812456263121
enem_challenge_acc,exam_id__2009: 0.6695652173913044
enem_challenge_acc,exam_id__2010: 0.7948717948717948
enem_challenge_acc,exam_id__2011: 0.8034188034188035
enem_challenge_acc,exam_id__2012: 0.7672413793103449
enem_challenge_acc,exam_id__2013: 0.7685185185185185
enem_challenge_acc,exam_id__2014: 0.7339449541284404
enem_challenge_acc,exam_id__2015: 0.7142857142857143
enem_challenge_acc,exam_id__2016: 0.7024793388429752
enem_challenge_acc,exam_id__2016_2: 0.6910569105691057
enem_challenge_acc,exam_id__2017: 0.6810344827586207
enem_challenge_acc,exam_id__2022: 0.6842105263157895
enem_challenge_acc,exam_id__2023: 0.7481481481481481
enem_challenge_acc_stderr,all: 0.006807641973418005
enem_challenge_acc_stderr,exam_id__2009: 0.02528804994180274
enem_challenge_acc_stderr,exam_id__2010: 0.02149640668210639
enem_challenge_acc_stderr,exam_id__2011: 0.021210692080503628
enem_challenge_acc_stderr,exam_id__2012: 0.022660217278927836
enem_challenge_acc_stderr,exam_id__2013: 0.02336336231167537
enem_challenge_acc_stderr,exam_id__2014: 0.024490632130058823
enem_challenge_acc_stderr,exam_id__2015: 0.023816151603437377
enem_challenge_acc_stderr,exam_id__2016: 0.023907462731025793
enem_challenge_acc_stderr,exam_id__2016_2: 0.024064368893731206
enem_challenge_acc_stderr,exam_id__2017: 0.024896265014785066
enem_challenge_acc_stderr,exam_id__2022: 0.023240775852571182
enem_challenge_acc_stderr,exam_id__2023: 0.021584384605568642
enem_challenge_alias: enem
faquad_nli_acc,all: 0.8584615384615385
faquad_nli_acc_stderr,all: 0.00964563979812997
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.7500687682235792
faquad_nli_f1_macro_stderr,all: 0.016061769703906006
global_piqa_completions_por_latn_braz_acc: 0.84
global_piqa_completions_por_latn_braz_acc_bytes: 0.78
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.041633319989322654
global_piqa_completions_por_latn_braz_acc_norm: 0.8
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.04020151261036849
global_piqa_completions_por_latn_braz_acc_stderr: 0.03684529491774706
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
gsm8k_pt_alias: gsm8k_pt
gsm8k_pt_exact_match,flexible-extract: 0.5380517503805176
gsm8k_pt_exact_match,strict-match: 0.0
gsm8k_pt_exact_match_stderr,flexible-extract: 0.013758665878004294
gsm8k_pt_exact_match_stderr,strict-match: 0.0
hatebr_offensive_acc,all: 0.8035714285714286
hatebr_offensive_acc_stderr,all: 0.007485688659514488
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.7971547930214926
hatebr_offensive_f1_macro_stderr,all: 0.0077008767186672504
hellaswag_poly_pt_acc: 0.4855347274894355
hellaswag_poly_pt_acc_norm: 0.6400476758045291
hellaswag_poly_pt_acc_norm_stderr: 0.0049966031862365996
hellaswag_poly_pt_acc_stderr: 0.00520276713464127
hellaswag_poly_pt_alias: hellaswag_poly_pt
humaneval_instruct_alias: humaneval_instruct
humaneval_instruct_pass@1,create_test: 0.47560975609756095
humaneval_instruct_pass@1_stderr,create_test: 0.039116399837036665
ifeval_pt_alias: ifeval_pt
ifeval_pt_inst_level_loose_acc: 0.5209302325581395
ifeval_pt_inst_level_loose_acc_stderr: "N/A"
ifeval_pt_inst_level_strict_acc: 0.48604651162790696
ifeval_pt_inst_level_strict_acc_stderr: "N/A"
ifeval_pt_prompt_level_loose_acc: 0.4166666666666667
ifeval_pt_prompt_level_loose_acc_stderr: 0.028511310643917525
ifeval_pt_prompt_level_strict_acc: 0.38666666666666666
ifeval_pt_prompt_level_strict_acc_stderr: 0.028163138908196883
lambada_poly_pt_acc: 0.5893654182029886
lambada_poly_pt_acc_stderr: 0.006853811533501879
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 10.134180916859764
lambada_poly_pt_perplexity_stderr: 0.3843105646542514
mmlu_poly_pt_acc: 0.6463524467126989
mmlu_poly_pt_acc_stderr: 0.004142085063565535
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.5430523917995445
oab_exams_acc,exam_id__2010-01: 0.38823529411764707
oab_exams_acc,exam_id__2010-02: 0.52
oab_exams_acc,exam_id__2011-03: 0.48484848484848486
oab_exams_acc,exam_id__2011-04: 0.5
oab_exams_acc,exam_id__2011-05: 0.6125
oab_exams_acc,exam_id__2012-06: 0.55
oab_exams_acc,exam_id__2012-06a: 0.6625
oab_exams_acc,exam_id__2012-07: 0.55
oab_exams_acc,exam_id__2012-08: 0.525
oab_exams_acc,exam_id__2012-09: 0.4675324675324675
oab_exams_acc,exam_id__2013-10: 0.525
oab_exams_acc,exam_id__2013-11: 0.525
oab_exams_acc,exam_id__2013-12: 0.5625
oab_exams_acc,exam_id__2014-13: 0.5375
oab_exams_acc,exam_id__2014-14: 0.525
oab_exams_acc,exam_id__2014-15: 0.6282051282051282
oab_exams_acc,exam_id__2015-16: 0.5
oab_exams_acc,exam_id__2015-17: 0.6282051282051282
oab_exams_acc,exam_id__2015-18: 0.5625
oab_exams_acc,exam_id__2016-19: 0.6153846153846154
oab_exams_acc,exam_id__2016-20: 0.6625
oab_exams_acc,exam_id__2016-20a: 0.5125
oab_exams_acc,exam_id__2016-21: 0.55
oab_exams_acc,exam_id__2017-22: 0.5375
oab_exams_acc,exam_id__2017-23: 0.5625
oab_exams_acc,exam_id__2017-24: 0.5625
oab_exams_acc,exam_id__2018-25: 0.4375
oab_exams_acc_stderr,all: 0.006135349097679556
oab_exams_acc_stderr,exam_id__2010-01: 0.03058058457150523
oab_exams_acc_stderr,exam_id__2010-02: 0.02878728598780894
oab_exams_acc_stderr,exam_id__2011-03: 0.028914895804693044
oab_exams_acc_stderr,exam_id__2011-04: 0.03228642008607808
oab_exams_acc_stderr,exam_id__2011-05: 0.031320378814850496
oab_exams_acc_stderr,exam_id__2012-06: 0.0321418972062254
oab_exams_acc_stderr,exam_id__2012-06a: 0.030467398275719
oab_exams_acc_stderr,exam_id__2012-07: 0.03200736259141147
oab_exams_acc_stderr,exam_id__2012-08: 0.032266439816730046
oab_exams_acc_stderr,exam_id__2012-09: 0.03267111052784679
oab_exams_acc_stderr,exam_id__2013-10: 0.03227871954884383
oab_exams_acc_stderr,exam_id__2013-11: 0.032317288738860646
oab_exams_acc_stderr,exam_id__2013-12: 0.03201360716056409
oab_exams_acc_stderr,exam_id__2014-13: 0.03212888100414898
oab_exams_acc_stderr,exam_id__2014-14: 0.03224480657768177
oab_exams_acc_stderr,exam_id__2014-15: 0.03161561320473922
oab_exams_acc_stderr,exam_id__2015-16: 0.032285769779197525
oab_exams_acc_stderr,exam_id__2015-17: 0.03149277459357065
oab_exams_acc_stderr,exam_id__2015-18: 0.03193661718329485
oab_exams_acc_stderr,exam_id__2016-19: 0.031871219023965544
oab_exams_acc_stderr,exam_id__2016-20: 0.030531876914015273
oab_exams_acc_stderr,exam_id__2016-20a: 0.03228630691735584
oab_exams_acc_stderr,exam_id__2016-21: 0.03207739680674974
oab_exams_acc_stderr,exam_id__2017-22: 0.03209350013865321
oab_exams_acc_stderr,exam_id__2017-23: 0.03217100216372156
oab_exams_acc_stderr,exam_id__2017-24: 0.032070126882455134
oab_exams_acc_stderr,exam_id__2018-25: 0.03198404003914565
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.6698002350176263
portuguese_hate_speech_acc_stderr,all: 0.011429241144055021
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.6614050364958336
portuguese_hate_speech_f1_macro_stderr,all: 0.01163030492382994
tweetsentbr_acc,all: 0.7069651741293532
tweetsentbr_acc_stderr,all: 0.007157154484263642
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.6707907518730517
tweetsentbr_f1_macro_stderr,all: 0.007569663916929406
step: 69750