| { |
| "config": { |
| "hub_repo": "CK0607/cross-model-lora-prediction-3b", |
| "model_Y": "meta-llama/Llama-3.2-3B-Instruct", |
| "no_surrogate": true, |
| "generation": { |
| "do_sample": false, |
| "num_beams": 1, |
| "max_new_tokens": 512 |
| }, |
| "tasks": [ |
| "mbpp_test_held", |
| "mbpp_plus" |
| ], |
| "cells": "2 tasks x (base_Y, oracle, mean, global_ridge, topk8_global_ridge)" |
| }, |
| "records": [ |
| { |
| "cell_id": "C::mbpp_plus::base_Y", |
| "task": "mbpp_plus", |
| "method": "base_Y", |
| "adapter_kind": "base", |
| "adapter_dir": null, |
| "max_new_tokens": 512, |
| "pass1": 0.5555555555555556, |
| "eval_examples": 378, |
| "generated_examples": 378, |
| "unit_test_eval": true, |
| "evalplus_used": true, |
| "details_summary": { |
| "passed": 210, |
| "failed": 168, |
| "first_failures": [ |
| { |
| "task_id": 806, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmptri4xqqa/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", |
| "code_chars": 245 |
| }, |
| { |
| "task_id": 590, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 41, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpa_k5ig0d/candidate_test.py\", line 32, in assertion\n if out != exp and atol != 0:\n ^^^^^^^^^^\nValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()\n", |
| "code_chars": 95 |
| }, |
| { |
| "task_id": 593, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpgo1nyw9p/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: <generator object removezero_ip.<locals>.<genexpr>.<genexpr> at 0x2b4afc27d620>.<generator object removezero_ip.<locals>.<genexpr>.<genexpr> at 0x2b4afc27d620>.<generator object removezero_ip.<loca", |
| "code_chars": 101 |
| }, |
| { |
| "task_id": 294, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpnpzig0qw/candidate_test.py\", line 39, in <module>\n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpnpzig0qw/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n", |
| "code_chars": 37 |
| }, |
| { |
| "task_id": 16, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 39, in <module>\n assertion(text_lowercase_underscore(*inp), exp, 0)\n File \"/tmp/tmpba__gp2n/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", |
| "code_chars": 96 |
| } |
| ] |
| }, |
| "gpu": 5, |
| "eval_seconds": 314.115, |
| "pass1_base": 0.5555555555555556, |
| "pass1_oracle": 0.5502645502645502, |
| "gap_recovered": -0.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_plus::global_ridge", |
| "task": "mbpp_plus", |
| "method": "global_ridge", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_global_ridge_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.5, |
| "eval_examples": 378, |
| "generated_examples": 378, |
| "unit_test_eval": true, |
| "evalplus_used": true, |
| "details_summary": { |
| "passed": 189, |
| "failed": 189, |
| "first_failures": [ |
| { |
| "task_id": 558, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 39, in <module>\n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp7z1v0kjv/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n", |
| "code_chars": 154 |
| }, |
| { |
| "task_id": 806, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpw3qi1w3a/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", |
| "code_chars": 245 |
| }, |
| { |
| "task_id": 775, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpcsvdn85j/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", |
| "code_chars": 100 |
| }, |
| { |
| "task_id": 141, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmp1f2jp4yb/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n", |
| "code_chars": 237 |
| }, |
| { |
| "task_id": 590, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 42, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpbbf8m54x/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", |
| "code_chars": 111 |
| } |
| ] |
| }, |
| "gpu": 0, |
| "eval_seconds": 372.125, |
| "pass1_base": 0.5555555555555556, |
| "pass1_oracle": 0.5502645502645502, |
| "gap_recovered": 10.499999999999895, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_plus::mean", |
| "task": "mbpp_plus", |
| "method": "mean", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_mean_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.5370370370370371, |
| "eval_examples": 378, |
| "generated_examples": 378, |
| "unit_test_eval": true, |
| "evalplus_used": true, |
| "details_summary": { |
| "passed": 203, |
| "failed": 175, |
| "first_failures": [ |
| { |
| "task_id": 806, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmpjltmt1oi/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", |
| "code_chars": 245 |
| }, |
| { |
| "task_id": 775, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmp7ovlp6ml/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", |
| "code_chars": 84 |
| }, |
| { |
| "task_id": 590, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 43, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp7tt6f0ai/candidate_test.py\", line 37, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", |
| "code_chars": 112 |
| }, |
| { |
| "task_id": 593, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmp1j0nm0et/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 0.0.0.0, exp: 0...\n", |
| "code_chars": 78 |
| }, |
| { |
| "task_id": 294, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 39, in <module>\n assertion(max_val(*inp), exp, 0)\n ^^^^^^^^^^^^^\n File \"/tmp/tmpifhduwmn/candidate_test.py\", line 11, in max_val\n return max(lst)\n ^^^^^^^^\nTypeError: '>' not supported between instances of 'int' and 'str'\n", |
| "code_chars": 37 |
| } |
| ] |
| }, |
| "gpu": 7, |
| "eval_seconds": 419.41, |
| "pass1_base": 0.5555555555555556, |
| "pass1_oracle": 0.5502645502645502, |
| "gap_recovered": 3.4999999999999583, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_plus::oracle", |
| "task": "mbpp_plus", |
| "method": "oracle", |
| "adapter_kind": "oracle", |
| "adapter_dir": "/workspace/round3_out/round4/Y/mbpp_plus", |
| "max_new_tokens": 512, |
| "pass1": 0.5502645502645502, |
| "eval_examples": 378, |
| "generated_examples": 378, |
| "unit_test_eval": true, |
| "evalplus_used": true, |
| "details_summary": { |
| "passed": 208, |
| "failed": 170, |
| "first_failures": [ |
| { |
| "task_id": 806, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp64ns7rf1/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", |
| "code_chars": 247 |
| }, |
| { |
| "task_id": 141, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpynhgsj0i/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [12, 21, 23, 45, 78, 89, 56, 90, 67, 76, 54, 54, 76, 32, 32, 67, 89], exp: [12, 21, 23, 32, 32, 45, 54, 54, 56, 67, 67, 76, 76, 78, 89, 89, 90]\n", |
| "code_chars": 199 |
| }, |
| { |
| "task_id": 590, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 40, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmpde362zz4/candidate_test.py\", line 34, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: ((5.0, 0.9272952180016122), (-1.960930862590836-2.2704074859237844j)), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", |
| "code_chars": 104 |
| }, |
| { |
| "task_id": 593, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 39, in <module>\n assertion(removezero_ip(*inp), exp, 0)\n File \"/tmp/tmpe67upedn/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 216.08.094.196, exp: 216.8.94.196\n", |
| "code_chars": 81 |
| }, |
| { |
| "task_id": 75, |
| "passed": false, |
| "error": " File \"/tmp/tmptps_2d9_/candidate_test.py\", line 11\n return tuple(x for x in test_list if all(x[i] % k == 0 for i in range(len(x)))\n ^\nSyntaxError: '(' was never closed\n", |
| "code_chars": 111 |
| } |
| ] |
| }, |
| "gpu": 6, |
| "eval_seconds": 860.499, |
| "pass1_base": 0.5555555555555556, |
| "pass1_oracle": 0.5502645502645502, |
| "gap_recovered": 1.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_plus::topk8_global_ridge", |
| "task": "mbpp_plus", |
| "method": "topk8_global_ridge", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_plus_topk8_global_ridge_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.4947089947089947, |
| "eval_examples": 378, |
| "generated_examples": 378, |
| "unit_test_eval": true, |
| "evalplus_used": true, |
| "details_summary": { |
| "passed": 187, |
| "failed": 191, |
| "first_failures": [ |
| { |
| "task_id": 558, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 39, in <module>\n assertion(digit_distance_nums(*inp), exp, 0)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/tmp/tmp0220tvq0/candidate_test.py\", line 11, in digit_distance_nums\n return abs(int(str(a)[0]) - int(str(b)[0])) + abs(int(str(a)[1]) - int(str(b)[1])) + abs(int(str(a)[2]) - int(str(b)[2]))\n ~~~~~~^^^\nIndexError: string index out of range\n", |
| "code_chars": 154 |
| }, |
| { |
| "task_id": 806, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 47, in <module>\n assertion(max_run_uppercase(*inp), exp, 0)\n File \"/tmp/tmp6ovain3_/candidate_test.py\", line 41, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: 1, exp: 0\n", |
| "code_chars": 245 |
| }, |
| { |
| "task_id": 775, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 39, in <module>\n assertion(odd_position(*inp), exp, 0)\n File \"/tmp/tmpyizpe0qu/candidate_test.py\", line 33, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: False, exp: True\n", |
| "code_chars": 100 |
| }, |
| { |
| "task_id": 141, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 44, in <module>\n assertion(pancake_sort(*inp), exp, 0)\n File \"/tmp/tmpnqt7raox/candidate_test.py\", line 38, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: [69, 38, 25, 79, 15], exp: [15, 25, 38, 69, 79]\n", |
| "code_chars": 237 |
| }, |
| { |
| "task_id": 590, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 42, in <module>\n assertion(polar_rect(*inp), exp, 0)\n File \"/tmp/tmp1qx22qre/candidate_test.py\", line 36, in assertion\n assert out == exp, f\"out: {out}, exp: {exp}\"\n ^^^^^^^^^^\nAssertionError: out: (-1.960930862590836, -2.2704074859237844), exp: ((5.0, 0.9272952180016122), (-2+2.4492935982947064e-16j))\n", |
| "code_chars": 111 |
| } |
| ] |
| }, |
| "gpu": 1, |
| "eval_seconds": 385.606, |
| "pass1_base": 0.5555555555555556, |
| "pass1_oracle": 0.5502645502645502, |
| "gap_recovered": 11.499999999999885, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_test_held::base_Y", |
| "task": "mbpp_test_held", |
| "method": "base_Y", |
| "adapter_kind": "base", |
| "adapter_dir": null, |
| "max_new_tokens": 512, |
| "pass1": 0.68, |
| "eval_examples": 100, |
| "generated_examples": 100, |
| "unit_test_eval": true, |
| "evalplus_used": false, |
| "details_summary": { |
| "passed": 68, |
| "failed": 32, |
| "first_failures": [ |
| { |
| "task_id": 72, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp8_12_t73/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 145 |
| }, |
| { |
| "task_id": 77, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpy2_se6hu/candidate_test.py\", line 16, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 49 |
| }, |
| { |
| "task_id": 138, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp539snbcc/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 119 |
| }, |
| { |
| "task_id": 143, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpnn4uxh_i/candidate_test.py\", line 15, in <module>\n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 47 |
| }, |
| { |
| "task_id": 56, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 15, in <module>\n assert check(70) == False\n ^^^^^^^^^\n File \"/tmp/tmpm38tbipo/candidate_test.py\", line 13, in check\n return str(n) == str(n)[::-1] + 1\n ~~~~~~~~~~~~~^~~\nTypeError: can only concatenate str (not \"int\") to str\n", |
| "code_chars": 51 |
| } |
| ] |
| }, |
| "gpu": 0, |
| "eval_seconds": 74.648, |
| "pass1_base": 0.68, |
| "pass1_oracle": 0.62, |
| "gap_recovered": -0.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_test_held::global_ridge", |
| "task": "mbpp_test_held", |
| "method": "global_ridge", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_global_ridge_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.62, |
| "eval_examples": 100, |
| "generated_examples": 100, |
| "unit_test_eval": true, |
| "evalplus_used": false, |
| "details_summary": { |
| "passed": 62, |
| "failed": 38, |
| "first_failures": [ |
| { |
| "task_id": 72, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpcl77akia/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 110 |
| }, |
| { |
| "task_id": 137, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpxql1jc2o/candidate_test.py\", line 15, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 51 |
| }, |
| { |
| "task_id": 138, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpepzmyft4/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 131 |
| }, |
| { |
| "task_id": 19, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp80yyqpv1/candidate_test.py\", line 15, in <module>\n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 59 |
| }, |
| { |
| "task_id": 16, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpx7kpx303/candidate_test.py\", line 15, in <module>\n assert text_lowercase_underscore(\"aab_cbbbc\")==(True)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 100 |
| } |
| ] |
| }, |
| "gpu": 3, |
| "eval_seconds": 90.194, |
| "pass1_base": 0.68, |
| "pass1_oracle": 0.62, |
| "gap_recovered": 1.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_test_held::mean", |
| "task": "mbpp_test_held", |
| "method": "mean", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_mean_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.62, |
| "eval_examples": 100, |
| "generated_examples": 100, |
| "unit_test_eval": true, |
| "evalplus_used": false, |
| "details_summary": { |
| "passed": 62, |
| "failed": 38, |
| "first_failures": [ |
| { |
| "task_id": 72, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpmzr1cfth/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 122 |
| }, |
| { |
| "task_id": 77, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpfst93r14/candidate_test.py\", line 16, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 49 |
| }, |
| { |
| "task_id": 137, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpa3ic2h3_/candidate_test.py\", line 17, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 122 |
| }, |
| { |
| "task_id": 138, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmphjayjee1/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 119 |
| }, |
| { |
| "task_id": 143, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpgho_ozi5/candidate_test.py\", line 15, in <module>\n assert find_lists(([1, 2, 3, 4], [5, 6, 7, 8])) == 2\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 47 |
| } |
| ] |
| }, |
| "gpu": 2, |
| "eval_seconds": 99.525, |
| "pass1_base": 0.68, |
| "pass1_oracle": 0.62, |
| "gap_recovered": 1.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_test_held::oracle", |
| "task": "mbpp_test_held", |
| "method": "oracle", |
| "adapter_kind": "oracle", |
| "adapter_dir": "/workspace/round3_out/round4/Y/mbpp_test_held", |
| "max_new_tokens": 512, |
| "pass1": 0.62, |
| "eval_examples": 100, |
| "generated_examples": 100, |
| "unit_test_eval": true, |
| "evalplus_used": false, |
| "details_summary": { |
| "passed": 62, |
| "failed": 38, |
| "first_failures": [ |
| { |
| "task_id": 72, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpodexp81h/candidate_test.py\", line 20, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 158 |
| }, |
| { |
| "task_id": 77, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpxaivmqy9/candidate_test.py\", line 21, in <module>\n assert is_Diff(1212112) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 152 |
| }, |
| { |
| "task_id": 137, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp24kwxxwv/candidate_test.py\", line 19, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 111 |
| }, |
| { |
| "task_id": 138, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp4k1en3yh/candidate_test.py\", line 21, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 176 |
| }, |
| { |
| "task_id": 16, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpbkeaovdv/candidate_test.py\", line 20, in <module>\n assert text_lowercase_underscore(\"Aaab_abbbc\")==(False)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 147 |
| } |
| ] |
| }, |
| "gpu": 1, |
| "eval_seconds": 104.671, |
| "pass1_base": 0.68, |
| "pass1_oracle": 0.62, |
| "gap_recovered": 1.0, |
| "target_domain": "code" |
| }, |
| { |
| "cell_id": "C::mbpp_test_held::topk8_global_ridge", |
| "task": "mbpp_test_held", |
| "method": "topk8_global_ridge", |
| "adapter_kind": "predicted", |
| "adapter_dir": "/workspace/round3_out/round6/Y_pred/mbpp_test_held_topk8_global_ridge_N24_full", |
| "max_new_tokens": 512, |
| "pass1": 0.6, |
| "eval_examples": 100, |
| "generated_examples": 100, |
| "unit_test_eval": true, |
| "evalplus_used": false, |
| "details_summary": { |
| "passed": 60, |
| "failed": 40, |
| "first_failures": [ |
| { |
| "task_id": 72, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpk02lwrma/candidate_test.py\", line 18, in <module>\n assert dif_Square(5) == True\n ^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 110 |
| }, |
| { |
| "task_id": 77, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmp09golsn6/candidate_test.py\", line 19, in <module>\n assert is_Diff (12345) == False\n ^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 140 |
| }, |
| { |
| "task_id": 137, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpablp55k9/candidate_test.py\", line 15, in <module>\n assert math.isclose(zero_count([0, 1, 2, -1, -5, 6, 0, -3, -2, 3, 4, 6, 8]), 0.181818, rel_tol=0.001)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 51 |
| }, |
| { |
| "task_id": 138, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpu8czsutd/candidate_test.py\", line 19, in <module>\n assert is_Sum_Of_Powers_Of_Two(10) == True\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 131 |
| }, |
| { |
| "task_id": 19, |
| "passed": false, |
| "error": "Traceback (most recent call last):\n File \"/tmp/tmpvj4cdzyb/candidate_test.py\", line 15, in <module>\n assert test_duplicate(([1,2,3,4,5]))==False\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nAssertionError\n", |
| "code_chars": 59 |
| } |
| ] |
| }, |
| "gpu": 4, |
| "eval_seconds": 89.27, |
| "pass1_base": 0.68, |
| "pass1_oracle": 0.62, |
| "gap_recovered": 1.3333333333333333, |
| "target_domain": "code" |
| } |
| ] |
| } |