# OpenCompass evaluation config: benchmarks a set of small "edge" LLMs
# (Phi-2, Llama-3-8B, Qwen2-7B) against a mixed suite of reasoning,
# knowledge, code, and long-context datasets.
from mmengine.config import read_base

# Inside read_base(), plain `from ... import ...` statements pull in other
# config files; the imported names land directly in this module's namespace.
with read_base():
    # datasets
    from opencompass.configs.datasets.bbh.bbh_gen import bbh_datasets
    from opencompass.configs.datasets.commonsenseqa.commonsenseqa_7shot_cot_gen_734a22 import \
        commonsenseqa_datasets
    from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen import \
        chid_datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    from opencompass.configs.datasets.humaneval.humaneval_gen import \
        humaneval_datasets
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets
    from opencompass.configs.datasets.truthfulqa.truthfulqa_gen import \
        truthfulqa_datasets
    # models
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model
    from opencompass.configs.models.others.hf_phi_2 import \
        models as hf_phi_2_model
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model

# Flatten every imported `*_datasets` list into a single list.
# NOTE(review): the `k == 'datasets'` clause is a no-op on a fresh run
# (`datasets` is not bound until this assignment completes); kept unchanged
# in case the config is ever re-executed in a namespace where it pre-exists.
datasets = sum([
    v
    for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'
], [])
# Flatten every imported `*_model` list into a single list of model configs.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
# All evaluation outputs (predictions, results, summaries) go under this dir.
work_dir = './outputs/edgellm/'
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset           version    metric            mode    phi-2_hf
# ----------------  ---------  ----------------  ------  ----------
# commonsense_qa    c946f2     accuracy          gen     65.19
# openai_humaneval  8e312c     humaneval_pass@1  gen     30.49
# truthful_qa       5ddc62     rouge_max         gen     0.08
# truthful_qa       5ddc62     rouge_diff        gen     -0.00
# truthful_qa       5ddc62     rouge_acc         gen     0.41
# gsm8k             1d7fe4     accuracy          gen     62.40
# chid-dev          211ee7     accuracy          gen     12.87
# chid-test         211ee7     accuracy          gen     14.34
# bbh               -          naive_average     gen     59.50
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset           version    metric            mode    Meta-Llama-3-8B_hf
# ----------------  ---------  ----------------  ------  --------------------
# commonsense_qa    c946f2     accuracy          gen     70.11
# openai_humaneval  8e312c     humaneval_pass@1  gen     26.22
# truthful_qa       5ddc62     rouge_max         gen     0.07
# truthful_qa       5ddc62     rouge_diff        gen     -0.01
# truthful_qa       5ddc62     rouge_acc         gen     0.41
# gsm8k             1d7fe4     accuracy          gen     55.80
# chid-dev          211ee7     accuracy          gen     40.59
# chid-test         211ee7     accuracy          gen     36.66
# bbh               -          naive_average     gen     61.62
# 20240816_060452
# tabulate format
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# dataset         version    metric      mode    qwen2-7b-hf
# --------------  ---------  ----------  ------  -------------
# commonsense_qa  734a22     accuracy    gen     65.19
# truthful_qa     5ddc62     rouge_max   gen     0.08
# truthful_qa     5ddc62     rouge_diff  gen     -0.02
# truthful_qa     5ddc62     rouge_acc   gen     0.44