cross-model-lora-prediction-3b / results_pass1_code.json
CK0607's picture
Final workshop round: results_pass1_code.json
e357986 verified
{
"config": {
"hub_repo": "CK0607/cross-model-lora-prediction-3b",
"model_Y": "meta-llama/Llama-3.2-3B-Instruct",
"no_surrogate": true,
"generation": {
"do_sample": false,
"num_beams": 1,
"max_new_tokens": 512
},
"tasks": [
"mbpp_test_held",
"mbpp_plus"
],
"cells": "2 tasks x (base_Y, oracle, mean, global_ridge, topk8_global_ridge)"
},
"records": [
{
"cell_id": "C::mbpp_plus::base_Y",
"task": "mbpp_plus",
"method": "base_Y",
"adapter_kind": "base",
"adapter_dir": null,
"max_new_tokens": 512,
"pass1": 0.5555555555555556,
"eval_examples": 378,
"generated_examples": 378,
"unit_test_eval": true,
"evalplus_used": true,
"details_summary": {
"passed": 210,
"failed": 168,
"first_failures": [
{
"task_id": 806,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n",
"code_chars": 245
},
{
"task_id": 590,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 41, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 32, in assertion\n if out != exp and atol != 0:\n ^^^^^^^^^^\nValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()\n",
"code_chars": 95
},
{
"task_id": 593,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: <generator object removezero_ip.<locals>.<genexpr>.<genexpr> at 0x2b4afc27d620>.<generator object removezero_ip.<locals>.<genexpr>.<genexpr> at 0x2b4afc27d620>.<generator object removezero_ip.<loca",
"code_chars": 101
},
{
"task_id": 294,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpnpzig0qw/candidate_test.py\", line 39, in <module>\n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpnpzig0qw/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n",
"code_chars": 37
},
{
"task_id": 16,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 39, in <module>\n assertion(text_lowercase_underscore(*inp), exp, 0)\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n",
"code_chars": 96
}
]
},
"gpu": 5,
"eval_seconds": 314.115,
"pass1_base": 0.5555555555555556,
"pass1_oracle": 0.5502645502645502,
"gap_recovered": -0.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_plus::global_ridge",
"task": "mbpp_plus",
"method": "global_ridge",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N24_full",
"max_new_tokens": 512,
"pass1": 0.5,
"eval_examples": 378,
"generated_examples": 378,
"unit_test_eval": true,
"evalplus_used": true,
"details_summary": {
"passed": 189,
"failed": 189,
"first_failures": [
{
"task_id": 558,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 39, in <module>\n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n",
"code_chars": 154
},
{
"task_id": 806,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n",
"code_chars": 245
},
{
"task_id": 775,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n",
"code_chars": 100
},
{
"task_id": 141,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n",
"code_chars": 237
},
{
"task_id": 590,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 42, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n",
"code_chars": 111
}
]
},
"gpu": 0,
"eval_seconds": 372.125,
"pass1_base": 0.5555555555555556,
"pass1_oracle": 0.5502645502645502,
"gap_recovered": 10.499999999999895,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_plus::mean",
"task": "mbpp_plus",
"method": "mean",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N24_full",
"max_new_tokens": 512,
"pass1": 0.5370370370370371,
"eval_examples": 378,
"generated_examples": 378,
"unit_test_eval": true,
"evalplus_used": true,
"details_summary": {
"passed": 203,
"failed": 175,
"first_failures": [
{
"task_id": 806,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n",
"code_chars": 245
},
{
"task_id": 775,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n",
"code_chars": 84
},
{
"task_id": 590,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 43, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 37, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n",
"code_chars": 112
},
{
"task_id": 593,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 0.0.0.0, exp: 0...\n",
"code_chars": 78
},
{
"task_id": 294,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 39, in <module>\n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n",
"code_chars": 37
}
]
},
"gpu": 7,
"eval_seconds": 419.41,
"pass1_base": 0.5555555555555556,
"pass1_oracle": 0.5502645502645502,
"gap_recovered": 3.4999999999999583,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_plus::oracle",
"task": "mbpp_plus",
"method": "oracle",
"adapter_kind": "oracle",
"adapter_dir": "/workspace/round3_out/round4/Y/mbpp_plus",
"max_new_tokens": 512,
"pass1": 0.5502645502645502,
"eval_examples": 378,
"generated_examples": 378,
"unit_test_eval": true,
"evalplus_used": true,
"details_summary": {
"passed": 208,
"failed": 170,
"first_failures": [
{
"task_id": 806,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n",
"code_chars": 247
},
{
"task_id": 141,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [12, 21, 23, 45, 78, 89, 56, 90, 67, 76, 54, 54, 76, 32, 32, 67, 89], exp: [12, 21, 23, 32, 32, 45, 54, 54, 56, 67, 67, 76, 76, 78, 89, 89, 90]\n",
"code_chars": 199
},
{
"task_id": 590,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 40, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 34, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: ((5.0, 0.9272952180016122), (-1.960930862590836-2.2704074859237844j)), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n",
"code_chars": 104
},
{
"task_id": 593,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 216.08.094.196, exp: 216.8.94.196\n",
"code_chars": 81
},
{
"task_id": 75,
"passed": false,
"error": " File \"/tmp/tmptps_2d9_/candidate_test.py\", line 11\n return tuple(x for x in test_list if all(x[i] % k == 0 for i in range(len(x)))\n ^\nSyntaxError: '(' was never closed\n",
"code_chars": 111
}
]
},
"gpu": 6,
"eval_seconds": 860.499,
"pass1_base": 0.5555555555555556,
"pass1_oracle": 0.5502645502645502,
"gap_recovered": 1.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_plus::topk8_global_ridge",
"task": "mbpp_plus",
"method": "topk8_global_ridge",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N24_full",
"max_new_tokens": 512,
"pass1": 0.4947089947089947,
"eval_examples": 378,
"generated_examples": 378,
"unit_test_eval": true,
"evalplus_used": true,
"details_summary": {
"passed": 187,
"failed": 191,
"first_failures": [
{
"task_id": 558,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 39, in <module>\n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n",
"code_chars": 154
},
{
"task_id": 806,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n",
"code_chars": 245
},
{
"task_id": 775,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n",
"code_chars": 100
},
{
"task_id": 141,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n",
"code_chars": 237
},
{
"task_id": 590,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 42, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n",
"code_chars": 111
}
]
},
"gpu": 1,
"eval_seconds": 385.606,
"pass1_base": 0.5555555555555556,
"pass1_oracle": 0.5502645502645502,
"gap_recovered": 11.499999999999885,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_test_held::base_Y",
"task": "mbpp_test_held",
"method": "base_Y",
"adapter_kind": "base",
"adapter_dir": null,
"max_new_tokens": 512,
"pass1": 0.68,
"eval_examples": 100,
"generated_examples": 100,
"unit_test_eval": true,
"evalplus_used": false,
"details_summary": {
"passed": 68,
"failed": 32,
"first_failures": [
{
"task_id": 72,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp8_12_t73/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 145
},
{
"task_id": 77,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpy2_se6hu/candidate_test.py\", line 16, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 49
},
{
"task_id": 138,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp539snbcc/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 119
},
{
"task_id": 143,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpnn4uxh_i/candidate_test.py\", line 15, in <module>\n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 47
},
{
"task_id": 56,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 15, in <module>\n assert check(70) == False\n ^^^^^^^^^\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 13, in check\n return str(n) == str(n)[::-1] + 1\n ~~~~~~~~~~~~~^~~\nTypeError: can only concatenate str (not \"int\") to str\n",
"code_chars": 51
}
]
},
"gpu": 0,
"eval_seconds": 74.648,
"pass1_base": 0.68,
"pass1_oracle": 0.62,
"gap_recovered": -0.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_test_held::global_ridge",
"task": "mbpp_test_held",
"method": "global_ridge",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N24_full",
"max_new_tokens": 512,
"pass1": 0.62,
"eval_examples": 100,
"generated_examples": 100,
"unit_test_eval": true,
"evalplus_used": false,
"details_summary": {
"passed": 62,
"failed": 38,
"first_failures": [
{
"task_id": 72,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpcl77akia/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 110
},
{
"task_id": 137,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpxql1jc2o/candidate_test.py\", line 15, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 51
},
{
"task_id": 138,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpepzmyft4/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 131
},
{
"task_id": 19,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp80yyqpv1/candidate_test.py\", line 15, in <module>\n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 59
},
{
"task_id": 16,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpx7kpx303/candidate_test.py\", line 15, in <module>\n assert text_lowercase_underscore(\"aab_cbbbc\")==(True)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 100
}
]
},
"gpu": 3,
"eval_seconds": 90.194,
"pass1_base": 0.68,
"pass1_oracle": 0.62,
"gap_recovered": 1.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_test_held::mean",
"task": "mbpp_test_held",
"method": "mean",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N24_full",
"max_new_tokens": 512,
"pass1": 0.62,
"eval_examples": 100,
"generated_examples": 100,
"unit_test_eval": true,
"evalplus_used": false,
"details_summary": {
"passed": 62,
"failed": 38,
"first_failures": [
{
"task_id": 72,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpmzr1cfth/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 122
},
{
"task_id": 77,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpfst93r14/candidate_test.py\", line 16, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 49
},
{
"task_id": 137,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpa3ic2h3_/candidate_test.py\", line 17, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 122
},
{
"task_id": 138,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmphjayjee1/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 119
},
{
"task_id": 143,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpgho_ozi5/candidate_test.py\", line 15, in <module>\n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 47
}
]
},
"gpu": 2,
"eval_seconds": 99.525,
"pass1_base": 0.68,
"pass1_oracle": 0.62,
"gap_recovered": 1.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_test_held::oracle",
"task": "mbpp_test_held",
"method": "oracle",
"adapter_kind": "oracle",
"adapter_dir": "/workspace/round3_out/round4/Y/mbpp_test_held",
"max_new_tokens": 512,
"pass1": 0.62,
"eval_examples": 100,
"generated_examples": 100,
"unit_test_eval": true,
"evalplus_used": false,
"details_summary": {
"passed": 62,
"failed": 38,
"first_failures": [
{
"task_id": 72,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpodexp81h/candidate_test.py\", line 20, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 158
},
{
"task_id": 77,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpxaivmqy9/candidate_test.py\", line 21, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 152
},
{
"task_id": 137,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp24kwxxwv/candidate_test.py\", line 19, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 111
},
{
"task_id": 138,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp4k1en3yh/candidate_test.py\", line 21, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 176
},
{
"task_id": 16,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpbkeaovdv/candidate_test.py\", line 20, in <module>\n assert text_lowercase_underscore(\"Aaab_abbbc\")==(False)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 147
}
]
},
"gpu": 1,
"eval_seconds": 104.671,
"pass1_base": 0.68,
"pass1_oracle": 0.62,
"gap_recovered": 1.0,
"target_domain": "code"
},
{
"cell_id": "C::mbpp_test_held::topk8_global_ridge",
"task": "mbpp_test_held",
"method": "topk8_global_ridge",
"adapter_kind": "predicted",
"adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N24_full",
"max_new_tokens": 512,
"pass1": 0.6,
"eval_examples": 100,
"generated_examples": 100,
"unit_test_eval": true,
"evalplus_used": false,
"details_summary": {
"passed": 60,
"failed": 40,
"first_failures": [
{
"task_id": 72,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpk02lwrma/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 110
},
{
"task_id": 77,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmp09golsn6/candidate_test.py\", line 19, in <module>\n assert is_Diff (12345) == False\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 140
},
{
"task_id": 137,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpablp55k9/candidate_test.py\", line 15, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 51
},
{
"task_id": 138,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpu8czsutd/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 131
},
{
"task_id": 19,
"passed": false,
"error": "Traceback (most recent call last):\n File \"/tmp/tmpvj4cdzyb/candidate_test.py\", line 15, in <module>\n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n",
"code_chars": 59
}
]
},
"gpu": 4,
"eval_seconds": 89.27,
"pass1_base": 0.68,
"pass1_oracle": 0.62,
"gap_recovered": 1.3333333333333333,
"target_domain": "code"
}
]
}