{ "config": { "hub_repo": "CK0607/cross-model-lora-prediction-3b", "model_Y": "meta-llama/Llama-3.2-3B-Instruct", "no_surrogate": true, "generation": { "do_sample": false, "num_beams": 1, "max_new_tokens": 512 }, "tasks": [ "mbpp_test_held", "mbpp_plus" ], "cells": "2 tasks x (base_Y, oracle, mean, global_ridge, topk8_global_ridge)" }, "records": [ { "cell_id": "C::mbpp_plus::base_Y", "task": "mbpp_plus", "method": "base_Y", "adapter_kind": "base", "adapter_dir": null, "max_new_tokens": 512, "pass1": 0.5555555555555556, "eval_examples": 378, "generated_examples": 378, "unit_test_eval": true, "evalplus_used": true, "details_summary": { "passed": 210, "failed": 168, "first_failures": [ { "task_id": 806, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 47, in \n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", "code_chars": 245 }, { "task_id": 590, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 41, in \n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 32, in assertion\n if out != exp and atol != 0:\n ^^^^^^^^^^\nValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()\n", "code_chars": 95 }, { "task_id": 593, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 39, in \n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: .. at 0x2b4afc27d620>... at 0x2b4afc27d620>.\n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpnpzig0qw/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n", "code_chars": 37 }, { "task_id": 16, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 39, in \n assertion(text_lowercase_underscore(*inp), exp, 0)\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", "code_chars": 96 } ] }, "gpu": 5, "eval_seconds": 314.115, "pass1_base": 0.5555555555555556, "pass1_oracle": 0.5502645502645502, "gap_recovered": -0.0, "target_domain": "code" }, { "cell_id": "C::mbpp_plus::global_ridge", "task": "mbpp_plus", "method": "global_ridge", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N24_full", "max_new_tokens": 512, "pass1": 0.5, "eval_examples": 378, "generated_examples": 378, "unit_test_eval": true, "evalplus_used": true, "details_summary": { "passed": 189, "failed": 189, "first_failures": [ { "task_id": 558, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 39, in \n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n", "code_chars": 154 }, { "task_id": 806, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 47, in \n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", "code_chars": 245 }, { "task_id": 775, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 39, in \n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", "code_chars": 100 }, { "task_id": 141, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 44, in \n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n", "code_chars": 237 }, { "task_id": 590, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 42, in \n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", "code_chars": 111 } ] }, "gpu": 0, "eval_seconds": 372.125, "pass1_base": 0.5555555555555556, "pass1_oracle": 0.5502645502645502, "gap_recovered": 10.499999999999895, "target_domain": "code" }, { "cell_id": "C::mbpp_plus::mean", "task": "mbpp_plus", "method": "mean", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N24_full", "max_new_tokens": 512, "pass1": 0.5370370370370371, "eval_examples": 378, "generated_examples": 378, "unit_test_eval": true, "evalplus_used": true, "details_summary": { "passed": 203, "failed": 175, "first_failures": [ { "task_id": 806, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 47, in \n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", "code_chars": 245 }, { "task_id": 775, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 39, in \n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", "code_chars": 84 }, { "task_id": 590, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 43, in \n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 37, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", "code_chars": 112 }, { "task_id": 593, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 39, in \n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 0.0.0.0, exp: 0...\n", "code_chars": 78 }, { "task_id": 294, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 39, in \n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n", "code_chars": 37 } ] }, "gpu": 7, "eval_seconds": 419.41, "pass1_base": 0.5555555555555556, "pass1_oracle": 0.5502645502645502, "gap_recovered": 3.4999999999999583, "target_domain": "code" }, { "cell_id": "C::mbpp_plus::oracle", "task": "mbpp_plus", "method": "oracle", "adapter_kind": "oracle", "adapter_dir": "/workspace/round3_out/round4/Y/mbpp_plus", "max_new_tokens": 512, "pass1": 0.5502645502645502, "eval_examples": 378, "generated_examples": 378, "unit_test_eval": true, "evalplus_used": true, "details_summary": { "passed": 208, "failed": 170, "first_failures": [ { "task_id": 806, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 47, in \n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", "code_chars": 247 }, { "task_id": 141, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 44, in \n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [12, 21, 23, 45, 78, 89, 56, 90, 67, 76, 54, 54, 76, 32, 32, 67, 89], exp: [12, 21, 23, 32, 32, 45, 54, 54, 56, 67, 67, 76, 76, 78, 89, 89, 90]\n", "code_chars": 199 }, { "task_id": 590, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 40, in \n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 34, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: ((5.0, 0.9272952180016122), (-1.960930862590836-2.2704074859237844j)), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", "code_chars": 104 }, { "task_id": 593, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 39, in \n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 216.08.094.196, exp: 216.8.94.196\n", "code_chars": 81 }, { "task_id": 75, "passed": false, "error": " File \"/tmp/tmptps_2d9_/candidate_test.py\", line 11\n return tuple(x for x in test_list if all(x[i] % k == 0 for i in range(len(x)))\n ^\nSyntaxError: '(' was never closed\n", "code_chars": 111 } ] }, "gpu": 6, "eval_seconds": 860.499, "pass1_base": 0.5555555555555556, "pass1_oracle": 0.5502645502645502, "gap_recovered": 1.0, "target_domain": "code" }, { "cell_id": "C::mbpp_plus::topk8_global_ridge", "task": "mbpp_plus", "method": "topk8_global_ridge", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N24_full", "max_new_tokens": 512, "pass1": 0.4947089947089947, "eval_examples": 378, "generated_examples": 378, "unit_test_eval": true, "evalplus_used": true, "details_summary": { "passed": 187, "failed": 191, "first_failures": [ { "task_id": 558, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 39, in \n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n", "code_chars": 154 }, { "task_id": 806, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 47, in \n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", "code_chars": 245 }, { "task_id": 775, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 39, in \n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", "code_chars": 100 }, { "task_id": 141, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 44, in \n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n", "code_chars": 237 }, { "task_id": 590, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 42, in \n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", "code_chars": 111 } ] }, "gpu": 1, "eval_seconds": 385.606, "pass1_base": 0.5555555555555556, "pass1_oracle": 0.5502645502645502, "gap_recovered": 11.499999999999885, "target_domain": "code" }, { "cell_id": "C::mbpp_test_held::base_Y", "task": "mbpp_test_held", "method": "base_Y", "adapter_kind": "base", "adapter_dir": null, "max_new_tokens": 512, "pass1": 0.68, "eval_examples": 100, "generated_examples": 100, "unit_test_eval": true, "evalplus_used": false, "details_summary": { "passed": 68, "failed": 32, "first_failures": [ { "task_id": 72, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp8_12_t73/candidate_test.py\", line 18, in \n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 145 }, { "task_id": 77, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpy2_se6hu/candidate_test.py\", line 16, in \n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 49 }, { "task_id": 138, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp539snbcc/candidate_test.py\", line 19, in \n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 119 }, { "task_id": 143, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpnn4uxh_i/candidate_test.py\", line 15, in \n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 47 }, { "task_id": 56, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 15, in \n assert check(70) == False\n ^^^^^^^^^\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 13, in check\n return str(n) == str(n)[::-1] + 1\n ~~~~~~~~~~~~~^~~\nTypeError: can only concatenate str (not \"int\") to str\n", "code_chars": 51 } ] }, "gpu": 0, "eval_seconds": 74.648, "pass1_base": 0.68, "pass1_oracle": 0.62, "gap_recovered": -0.0, "target_domain": "code" }, { "cell_id": "C::mbpp_test_held::global_ridge", "task": "mbpp_test_held", "method": "global_ridge", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N24_full", "max_new_tokens": 512, "pass1": 0.62, "eval_examples": 100, "generated_examples": 100, "unit_test_eval": true, "evalplus_used": false, "details_summary": { "passed": 62, "failed": 38, "first_failures": [ { "task_id": 72, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpcl77akia/candidate_test.py\", line 18, in \n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 110 }, { "task_id": 137, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpxql1jc2o/candidate_test.py\", line 15, in \n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 51 }, { "task_id": 138, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpepzmyft4/candidate_test.py\", line 19, in \n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 131 }, { "task_id": 19, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp80yyqpv1/candidate_test.py\", line 15, in \n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 59 }, { "task_id": 16, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpx7kpx303/candidate_test.py\", line 15, in \n assert text_lowercase_underscore(\"aab_cbbbc\")==(True)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 100 } ] }, "gpu": 3, "eval_seconds": 90.194, "pass1_base": 0.68, "pass1_oracle": 0.62, "gap_recovered": 1.0, "target_domain": "code" }, { "cell_id": "C::mbpp_test_held::mean", "task": "mbpp_test_held", "method": "mean", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N24_full", "max_new_tokens": 512, "pass1": 0.62, "eval_examples": 100, "generated_examples": 100, "unit_test_eval": true, "evalplus_used": false, "details_summary": { "passed": 62, "failed": 38, "first_failures": [ { "task_id": 72, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpmzr1cfth/candidate_test.py\", line 18, in \n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 122 }, { "task_id": 77, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpfst93r14/candidate_test.py\", line 16, in \n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 49 }, { "task_id": 137, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpa3ic2h3_/candidate_test.py\", line 17, in \n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 122 }, { "task_id": 138, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmphjayjee1/candidate_test.py\", line 19, in \n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 119 }, { "task_id": 143, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpgho_ozi5/candidate_test.py\", line 15, in \n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 47 } ] }, "gpu": 2, "eval_seconds": 99.525, "pass1_base": 0.68, "pass1_oracle": 0.62, "gap_recovered": 1.0, "target_domain": "code" }, { "cell_id": "C::mbpp_test_held::oracle", "task": "mbpp_test_held", "method": "oracle", "adapter_kind": "oracle", "adapter_dir": "/workspace/round3_out/round4/Y/mbpp_test_held", "max_new_tokens": 512, "pass1": 0.62, "eval_examples": 100, "generated_examples": 100, "unit_test_eval": true, "evalplus_used": false, "details_summary": { "passed": 62, "failed": 38, "first_failures": [ { "task_id": 72, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpodexp81h/candidate_test.py\", line 20, in \n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 158 }, { "task_id": 77, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpxaivmqy9/candidate_test.py\", line 21, in \n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 152 }, { "task_id": 137, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp24kwxxwv/candidate_test.py\", line 19, in \n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 111 }, { "task_id": 138, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp4k1en3yh/candidate_test.py\", line 21, in \n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 176 }, { "task_id": 16, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpbkeaovdv/candidate_test.py\", line 20, in \n assert text_lowercase_underscore(\"Aaab_abbbc\")==(False)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 147 } ] }, "gpu": 1, "eval_seconds": 104.671, "pass1_base": 0.68, "pass1_oracle": 0.62, "gap_recovered": 1.0, "target_domain": "code" }, { "cell_id": "C::mbpp_test_held::topk8_global_ridge", "task": "mbpp_test_held", "method": "topk8_global_ridge", "adapter_kind": "predicted", "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N24_full", "max_new_tokens": 512, "pass1": 0.6, "eval_examples": 100, "generated_examples": 100, "unit_test_eval": true, "evalplus_used": false, "details_summary": { "passed": 60, "failed": 40, "first_failures": [ { "task_id": 72, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpk02lwrma/candidate_test.py\", line 18, in \n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 110 }, { "task_id": 77, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmp09golsn6/candidate_test.py\", line 19, in \n assert is_Diff (12345) == False\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 140 }, { "task_id": 137, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpablp55k9/candidate_test.py\", line 15, in \n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 51 }, { "task_id": 138, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpu8czsutd/candidate_test.py\", line 19, in \n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 131 }, { "task_id": 19, "passed": false, "error": "Traceback (most recent call last):\n File \"/tmp/tmpvj4cdzyb/candidate_test.py\", line 15, in \n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", "code_chars": 59 } ] }, "gpu": 4, "eval_seconds": 89.27, "pass1_base": 0.68, "pass1_oracle": 0.62, "gap_recovered": 1.3333333333333333, "target_domain": "code" } ] }