Spaces:
Sleeping
Sleeping
Mirror run4b eval artifacts
Browse files- assets/trained_eval_run4b_8b_sft/eval/trained_eval_rows.csv +151 -0
- assets/trained_eval_run4b_8b_sft/eval/trained_eval_rows.jsonl +150 -0
- assets/trained_eval_run4b_8b_sft/eval/trained_eval_summary.json +52 -0
- assets/trained_eval_run4b_8b_sft/eval/trained_eval_transcripts.md +114 -0
- assets/trained_eval_run4b_8b_sft/training_summary.json +25 -0
assets/trained_eval_run4b_8b_sft/eval/trained_eval_rows.csv
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
agent,seed,case_id,difficulty,reward,primary_reward,auxiliary_reward,contradictions_total,contradictions_triggered,contradictions_surfaced,questions_used,evidence_presented,evidence_timing_successes,blind_evidence_count,useless_questions_ratio,avg_question_length,model_repo,invalid_tool_calls
|
| 2 |
+
random,20260425,timeline_255d67,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
|
| 3 |
+
random,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
| 4 |
+
random,20260427,workplace_c98377,easy,0.0,0.0,-0.4,1,0,0,3,5,0,5,1.0,5.0,,
|
| 5 |
+
random,20260428,motive_66ff59,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
| 6 |
+
random,20260429,timeline_19bb78,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
|
| 7 |
+
random,20260430,timeline_a97690,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
| 8 |
+
random,20260431,alibi_67ffcd,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
| 9 |
+
random,20260432,alibi_423bca,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 10 |
+
random,20260433,knowledge_960d07,medium,0.0,0.0,-0.4,2,0,0,8,0,0,0,1.0,5.0,,
|
| 11 |
+
random,20260434,alibi_e829c1,easy,0.0,0.0,-0.4,1,0,0,7,1,0,1,1.0,5.0,,
|
| 12 |
+
random,20260435,motive_85e25b,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
|
| 13 |
+
random,20260436,knowledge_a599e3,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
| 14 |
+
random,20260437,motive_8bca20,easy,0.0,0.0,-0.4,1,0,0,6,2,0,2,1.0,5.0,,
|
| 15 |
+
random,20260438,corporate_6b1664,medium,0.0,0.0,-0.4,2,0,0,6,2,0,2,1.0,5.0,,
|
| 16 |
+
random,20260439,alibi_a6c582,easy,0.0,0.0,-0.4,1,0,0,8,0,0,0,1.0,5.0,,
|
| 17 |
+
random,20260440,workplace_835476,easy,0.0,0.0,-0.4,1,0,0,5,3,0,3,1.0,5.0,,
|
| 18 |
+
random,20260441,possession_a079c5,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
|
| 19 |
+
random,20260442,possession_9cc45d,hard,0.0,0.0,-0.4,3,0,0,5,3,0,3,1.0,5.0,,
|
| 20 |
+
random,20260443,possession_259aa5,easy,0.0,0.0,-0.4,1,0,0,4,4,0,4,1.0,5.0,,
|
| 21 |
+
random,20260444,corporate_76724c,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 22 |
+
random,20260445,timeline_767821,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 23 |
+
random,20260446,motive_c0d166,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 24 |
+
random,20260447,corporate_307934,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
| 25 |
+
random,20260448,timeline_592816,hard,0.0,0.0,-0.4,3,0,0,6,2,0,2,1.0,5.0,,
|
| 26 |
+
random,20260449,knowledge_b26824,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 27 |
+
random,20260450,knowledge_697785,hard,0.0,0.0,-0.4,3,0,0,7,1,0,1,1.0,5.0,,
|
| 28 |
+
random,20260451,timeline_81dafd,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
| 29 |
+
random,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.4,2,0,0,4,4,0,4,1.0,5.0,,
|
| 30 |
+
random,20260453,possession_dbb5fe,medium,0.0,0.0,-0.4,2,0,0,5,3,0,3,1.0,5.0,,
|
| 31 |
+
random,20260454,alibi_a4666f,hard,0.0,0.0,-0.4,3,0,0,8,0,0,0,1.0,5.0,,
|
| 32 |
+
keyword_spam,20260425,timeline_255d67,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
|
| 33 |
+
keyword_spam,20260426,knowledge_b28f8c,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
| 34 |
+
keyword_spam,20260427,workplace_c98377,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
| 35 |
+
keyword_spam,20260428,motive_66ff59,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
|
| 36 |
+
keyword_spam,20260429,timeline_19bb78,easy,0.030000000000000006,0.0,0.15000000000000002,1,1,0,5,0,0,0,0.6,4.2,,
|
| 37 |
+
keyword_spam,20260430,timeline_a97690,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
| 38 |
+
keyword_spam,20260431,alibi_67ffcd,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
|
| 39 |
+
keyword_spam,20260432,alibi_423bca,medium,0.04000000000000001,0.0,0.20000000000000004,2,1,0,5,0,0,0,0.4,4.2,,
|
| 40 |
+
keyword_spam,20260433,knowledge_960d07,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
| 41 |
+
keyword_spam,20260434,alibi_e829c1,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
|
| 42 |
+
keyword_spam,20260435,motive_85e25b,hard,0.17000000000000004,0.0,0.8500000000000001,3,3,0,5,0,0,0,0.2,4.2,,
|
| 43 |
+
keyword_spam,20260436,knowledge_a599e3,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
| 44 |
+
keyword_spam,20260437,motive_8bca20,easy,0.0,0.0,-0.15000000000000002,1,0,0,5,0,0,0,0.6,4.2,,
|
| 45 |
+
keyword_spam,20260438,corporate_6b1664,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
| 46 |
+
keyword_spam,20260439,alibi_a6c582,easy,0.04000000000000001,0.0,0.20000000000000004,1,1,0,5,0,0,0,0.4,4.2,,
|
| 47 |
+
keyword_spam,20260440,workplace_835476,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
| 48 |
+
keyword_spam,20260441,possession_a079c5,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
|
| 49 |
+
keyword_spam,20260442,possession_9cc45d,hard,0.030000000000000006,0.0,0.15000000000000002,3,1,0,5,0,0,0,0.6,4.2,,
|
| 50 |
+
keyword_spam,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,5,0,0,0,0.8,4.2,,
|
| 51 |
+
keyword_spam,20260444,corporate_76724c,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
| 52 |
+
keyword_spam,20260445,timeline_767821,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
| 53 |
+
keyword_spam,20260446,motive_c0d166,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
| 54 |
+
keyword_spam,20260447,corporate_307934,hard,0.020000000000000007,0.0,0.10000000000000003,3,1,0,5,0,0,0,0.8,4.2,,
|
| 55 |
+
keyword_spam,20260448,timeline_592816,hard,0.19,0.0,0.95,3,3,0,5,0,0,0,0.2,4.2,,
|
| 56 |
+
keyword_spam,20260449,knowledge_b26824,medium,0.12000000000000002,0.0,0.6000000000000001,2,2,0,5,0,0,0,0.4,4.2,,
|
| 57 |
+
keyword_spam,20260450,knowledge_697785,hard,0.12000000000000002,0.0,0.6000000000000001,3,2,0,5,0,0,0,0.4,4.2,,
|
| 58 |
+
keyword_spam,20260451,timeline_81dafd,medium,0.13,0.0,0.65,2,2,0,5,0,0,0,0.2,4.2,,
|
| 59 |
+
keyword_spam,20260452,corporate_8eb7d7,medium,0.020000000000000007,0.0,0.10000000000000003,2,1,0,5,0,0,0,0.8,4.2,,
|
| 60 |
+
keyword_spam,20260453,possession_dbb5fe,medium,0.030000000000000006,0.0,0.15000000000000002,2,1,0,5,0,0,0,0.6,4.2,,
|
| 61 |
+
keyword_spam,20260454,alibi_a4666f,hard,0.15000000000000002,0.0,0.75,3,2,0,5,0,0,0,0.2,4.2,,
|
| 62 |
+
present_all,20260425,timeline_255d67,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
| 63 |
+
present_all,20260426,knowledge_b28f8c,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 64 |
+
present_all,20260427,workplace_c98377,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
| 65 |
+
present_all,20260428,motive_66ff59,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
| 66 |
+
present_all,20260429,timeline_19bb78,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
| 67 |
+
present_all,20260430,timeline_a97690,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 68 |
+
present_all,20260431,alibi_67ffcd,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 69 |
+
present_all,20260432,alibi_423bca,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 70 |
+
present_all,20260433,knowledge_960d07,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 71 |
+
present_all,20260434,alibi_e829c1,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
| 72 |
+
present_all,20260435,motive_85e25b,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
| 73 |
+
present_all,20260436,knowledge_a599e3,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 74 |
+
present_all,20260437,motive_8bca20,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
| 75 |
+
present_all,20260438,corporate_6b1664,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 76 |
+
present_all,20260439,alibi_a6c582,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
| 77 |
+
present_all,20260440,workplace_835476,easy,0.0,0.0,-0.25,1,0,0,0,5,0,5,0.0,0.0,,
|
| 78 |
+
present_all,20260441,possession_a079c5,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
| 79 |
+
present_all,20260442,possession_9cc45d,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
| 80 |
+
present_all,20260443,possession_259aa5,easy,0.0,0.0,-0.2,1,0,0,0,4,0,4,0.0,0.0,,
|
| 81 |
+
present_all,20260444,corporate_76724c,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 82 |
+
present_all,20260445,timeline_767821,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 83 |
+
present_all,20260446,motive_c0d166,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 84 |
+
present_all,20260447,corporate_307934,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
| 85 |
+
present_all,20260448,timeline_592816,hard,0.0,0.0,-0.35000000000000003,3,0,0,0,7,0,7,0.0,0.0,,
|
| 86 |
+
present_all,20260449,knowledge_b26824,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 87 |
+
present_all,20260450,knowledge_697785,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
| 88 |
+
present_all,20260451,timeline_81dafd,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 89 |
+
present_all,20260452,corporate_8eb7d7,medium,0.0,0.0,-0.25,2,0,0,0,5,0,5,0.0,0.0,,
|
| 90 |
+
present_all,20260453,possession_dbb5fe,medium,0.0,0.0,-0.2,2,0,0,0,4,0,4,0.0,0.0,,
|
| 91 |
+
present_all,20260454,alibi_a4666f,hard,0.0,0.0,-0.30000000000000004,3,0,0,0,6,0,6,0.0,0.0,,
|
| 92 |
+
scripted_oracle,20260425,timeline_255d67,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 93 |
+
scripted_oracle,20260426,knowledge_b28f8c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
| 94 |
+
scripted_oracle,20260427,workplace_c98377,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 95 |
+
scripted_oracle,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 96 |
+
scripted_oracle,20260429,timeline_19bb78,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 97 |
+
scripted_oracle,20260430,timeline_a97690,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
| 98 |
+
scripted_oracle,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
|
| 99 |
+
scripted_oracle,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,,
|
| 100 |
+
scripted_oracle,20260433,knowledge_960d07,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
| 101 |
+
scripted_oracle,20260434,alibi_e829c1,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
|
| 102 |
+
scripted_oracle,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 103 |
+
scripted_oracle,20260436,knowledge_a599e3,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
| 104 |
+
scripted_oracle,20260437,motive_8bca20,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 105 |
+
scripted_oracle,20260438,corporate_6b1664,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
| 106 |
+
scripted_oracle,20260439,alibi_a6c582,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,3.0,,
|
| 107 |
+
scripted_oracle,20260440,workplace_835476,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 108 |
+
scripted_oracle,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 109 |
+
scripted_oracle,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 110 |
+
scripted_oracle,20260443,possession_259aa5,easy,0.8800000000000001,1.0,0.4,1,1,1,1,1,1,0,0.0,1.0,,
|
| 111 |
+
scripted_oracle,20260444,corporate_76724c,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
| 112 |
+
scripted_oracle,20260445,timeline_767821,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
| 113 |
+
scripted_oracle,20260446,motive_c0d166,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
| 114 |
+
scripted_oracle,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 115 |
+
scripted_oracle,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,,
|
| 116 |
+
scripted_oracle,20260449,knowledge_b26824,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.5,,
|
| 117 |
+
scripted_oracle,20260450,knowledge_697785,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.6666666666666667,,
|
| 118 |
+
scripted_oracle,20260451,timeline_81dafd,medium,0.49000000000000005,0.5,0.45,2,1,1,2,2,1,1,0.0,1.0,,
|
| 119 |
+
scripted_oracle,20260452,corporate_8eb7d7,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
| 120 |
+
scripted_oracle,20260453,possession_dbb5fe,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,1.0,,
|
| 121 |
+
scripted_oracle,20260454,alibi_a4666f,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,2.3333333333333335,,
|
| 122 |
+
trained_qwen3_8b_qlora_sft_run4b,20260425,timeline_255d67,easy,0.8400000000000001,1.0,0.2,1,1,1,4,4,1,3,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 123 |
+
trained_qwen3_8b_qlora_sft_run4b,20260426,knowledge_b28f8c,medium,0.9500000000000001,1.0,0.75,2,2,2,2,3,2,1,0.0,1.5,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 124 |
+
trained_qwen3_8b_qlora_sft_run4b,20260427,workplace_c98377,easy,0.8400000000000001,1.0,0.2,1,1,1,4,4,1,3,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 125 |
+
trained_qwen3_8b_qlora_sft_run4b,20260428,motive_66ff59,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 126 |
+
trained_qwen3_8b_qlora_sft_run4b,20260429,timeline_19bb78,easy,0.8300000000000001,1.0,0.15000000000000002,1,1,1,4,4,1,3,0.5,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 127 |
+
trained_qwen3_8b_qlora_sft_run4b,20260430,timeline_a97690,medium,0.47000000000000003,0.5,0.35,2,1,1,4,4,1,3,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 128 |
+
trained_qwen3_8b_qlora_sft_run4b,20260431,alibi_67ffcd,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 129 |
+
trained_qwen3_8b_qlora_sft_run4b,20260432,alibi_423bca,medium,0.9600000000000001,1.0,0.8,2,2,2,2,2,2,0,0.0,3.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 130 |
+
trained_qwen3_8b_qlora_sft_run4b,20260433,knowledge_960d07,medium,0.9500000000000001,1.0,0.7500000000000001,2,2,2,4,4,2,2,0.25,1.25,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 131 |
+
trained_qwen3_8b_qlora_sft_run4b,20260434,alibi_e829c1,easy,0.8200000000000001,1.0,0.09999999999999998,1,1,1,4,4,1,3,0.75,2.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 132 |
+
trained_qwen3_8b_qlora_sft_run4b,20260435,motive_85e25b,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 133 |
+
trained_qwen3_8b_qlora_sft_run4b,20260436,knowledge_a599e3,medium,0.9400000000000001,1.0,0.7000000000000001,2,2,2,2,4,2,2,0.0,1.5,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 134 |
+
trained_qwen3_8b_qlora_sft_run4b,20260437,motive_8bca20,easy,0.8300000000000001,1.0,0.15000000000000002,1,1,1,4,4,1,3,0.5,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 135 |
+
trained_qwen3_8b_qlora_sft_run4b,20260438,corporate_6b1664,medium,0.93,1.0,0.65,2,2,2,4,4,2,2,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 136 |
+
trained_qwen3_8b_qlora_sft_run4b,20260439,alibi_a6c582,easy,0.8300000000000001,1.0,0.15000000000000002,1,1,1,4,4,1,3,0.5,2.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 137 |
+
trained_qwen3_8b_qlora_sft_run4b,20260440,workplace_835476,easy,0.8400000000000001,1.0,0.2,1,1,1,4,4,1,3,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 138 |
+
trained_qwen3_8b_qlora_sft_run4b,20260441,possession_a079c5,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 139 |
+
trained_qwen3_8b_qlora_sft_run4b,20260442,possession_9cc45d,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 140 |
+
trained_qwen3_8b_qlora_sft_run4b,20260443,possession_259aa5,easy,0.8400000000000001,1.0,0.20000000000000004,1,1,1,3,3,1,2,0.6666666666666666,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 141 |
+
trained_qwen3_8b_qlora_sft_run4b,20260444,corporate_76724c,medium,0.93,1.0,0.65,2,2,2,4,4,2,2,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 142 |
+
trained_qwen3_8b_qlora_sft_run4b,20260445,timeline_767821,medium,0.47000000000000003,0.5,0.35,2,1,1,4,4,1,3,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 143 |
+
trained_qwen3_8b_qlora_sft_run4b,20260446,motive_c0d166,medium,0.9500000000000001,1.0,0.7500000000000001,2,2,2,4,4,2,2,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 144 |
+
trained_qwen3_8b_qlora_sft_run4b,20260447,corporate_307934,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 145 |
+
trained_qwen3_8b_qlora_sft_run4b,20260448,timeline_592816,hard,1.0,1.0,1.2000000000000002,3,3,3,3,3,3,0,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 146 |
+
trained_qwen3_8b_qlora_sft_run4b,20260449,knowledge_b26824,medium,0.9400000000000001,1.0,0.7000000000000001,2,2,2,2,4,2,2,0.0,1.5,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 147 |
+
trained_qwen3_8b_qlora_sft_run4b,20260450,knowledge_697785,hard,0.6833333333333333,0.6666666666666666,0.7500000000000001,3,2,2,4,4,2,2,0.25,1.5,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 148 |
+
trained_qwen3_8b_qlora_sft_run4b,20260451,timeline_81dafd,medium,0.47000000000000003,0.5,0.35,2,1,1,4,4,1,3,0.0,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 149 |
+
trained_qwen3_8b_qlora_sft_run4b,20260452,corporate_8eb7d7,medium,0.93,1.0,0.65,2,2,2,4,4,2,2,0.25,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 150 |
+
trained_qwen3_8b_qlora_sft_run4b,20260453,possession_dbb5fe,medium,0.9400000000000001,1.0,0.7,2,2,2,3,3,2,1,0.3333333333333333,1.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
| 151 |
+
trained_qwen3_8b_qlora_sft_run4b,20260454,alibi_a4666f,hard,0.6533333333333333,0.6666666666666666,0.6000000000000001,3,2,2,4,4,2,2,0.5,2.0,heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b,0.0
|
assets/trained_eval_run4b_8b_sft/eval/trained_eval_rows.jsonl
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 1.0}
|
| 2 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 1.0}
|
| 3 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 3, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 1.0}
|
| 4 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 1.0}
|
| 5 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 1.0}
|
| 6 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 1.0}
|
| 7 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 1.0}
|
| 8 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 1.0}
|
| 9 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 1.0}
|
| 10 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 1.0}
|
| 11 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 1.0}
|
| 12 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 1.0}
|
| 13 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 1.0}
|
| 14 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 1.0}
|
| 15 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 1.0}
|
| 16 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 1.0}
|
| 17 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 1.0}
|
| 18 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 1.0}
|
| 19 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 1.0}
|
| 20 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 1.0}
|
| 21 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 1.0}
|
| 22 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 1.0}
|
| 23 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 1.0}
|
| 24 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 2, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 2, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 6, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 1.0}
|
| 25 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 1.0}
|
| 26 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 1, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 1, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 7, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 1.0}
|
| 27 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 1.0}
|
| 28 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 4, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 4, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 1.0}
|
| 29 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 3, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 1.0}
|
| 30 |
+
{"agent": "random", "auxiliary_reward": -0.4, "avg_question_length": 5.0, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 8, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 1.0}
|
| 31 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260425, "useless_questions_ratio": 0.6}
|
| 32 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260426, "useless_questions_ratio": 0.4}
|
| 33 |
+
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.8}
|
| 34 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260428, "useless_questions_ratio": 0.2}
|
| 35 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260429, "useless_questions_ratio": 0.6}
|
| 36 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260430, "useless_questions_ratio": 0.2}
|
| 37 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260431, "useless_questions_ratio": 0.4}
|
| 38 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260432, "useless_questions_ratio": 0.4}
|
| 39 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260433, "useless_questions_ratio": 0.4}
|
| 40 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260434, "useless_questions_ratio": 0.4}
|
| 41 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.8500000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.17000000000000004, "seed": 20260435, "useless_questions_ratio": 0.2}
|
| 42 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260436, "useless_questions_ratio": 0.4}
|
| 43 |
+
{"agent": "keyword_spam", "auxiliary_reward": -0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.6}
|
| 44 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260438, "useless_questions_ratio": 0.8}
|
| 45 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.04000000000000001, "seed": 20260439, "useless_questions_ratio": 0.4}
|
| 46 |
+
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.8}
|
| 47 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260441, "useless_questions_ratio": 0.6}
|
| 48 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260442, "useless_questions_ratio": 0.6}
|
| 49 |
+
{"agent": "keyword_spam", "auxiliary_reward": -0.2, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.8}
|
| 50 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260444, "useless_questions_ratio": 0.8}
|
| 51 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260445, "useless_questions_ratio": 0.2}
|
| 52 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260446, "useless_questions_ratio": 0.2}
|
| 53 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 1, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260447, "useless_questions_ratio": 0.8}
|
| 54 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.95, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.19, "seed": 20260448, "useless_questions_ratio": 0.2}
|
| 55 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260449, "useless_questions_ratio": 0.4}
|
| 56 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.12000000000000002, "seed": 20260450, "useless_questions_ratio": 0.4}
|
| 57 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.65, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.13, "seed": 20260451, "useless_questions_ratio": 0.2}
|
| 58 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.10000000000000003, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.020000000000000007, "seed": 20260452, "useless_questions_ratio": 0.8}
|
| 59 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.030000000000000006, "seed": 20260453, "useless_questions_ratio": 0.6}
|
| 60 |
+
{"agent": "keyword_spam", "auxiliary_reward": 0.75, "avg_question_length": 4.2, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 0, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 5, "reward": 0.15000000000000002, "seed": 20260454, "useless_questions_ratio": 0.2}
|
| 61 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_255d67", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260425, "useless_questions_ratio": 0.0}
|
| 62 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260426, "useless_questions_ratio": 0.0}
|
| 63 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_c98377", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260427, "useless_questions_ratio": 0.0}
|
| 64 |
+
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_66ff59", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260428, "useless_questions_ratio": 0.0}
|
| 65 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_19bb78", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260429, "useless_questions_ratio": 0.0}
|
| 66 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_a97690", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260430, "useless_questions_ratio": 0.0}
|
| 67 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_67ffcd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260431, "useless_questions_ratio": 0.0}
|
| 68 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_423bca", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260432, "useless_questions_ratio": 0.0}
|
| 69 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_960d07", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260433, "useless_questions_ratio": 0.0}
|
| 70 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_e829c1", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260434, "useless_questions_ratio": 0.0}
|
| 71 |
+
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "motive_85e25b", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260435, "useless_questions_ratio": 0.0}
|
| 72 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_a599e3", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260436, "useless_questions_ratio": 0.0}
|
| 73 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_8bca20", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260437, "useless_questions_ratio": 0.0}
|
| 74 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_6b1664", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260438, "useless_questions_ratio": 0.0}
|
| 75 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "alibi_a6c582", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260439, "useless_questions_ratio": 0.0}
|
| 76 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "workplace_835476", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260440, "useless_questions_ratio": 0.0}
|
| 77 |
+
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_a079c5", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260441, "useless_questions_ratio": 0.0}
|
| 78 |
+
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "possession_9cc45d", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
| 79 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_259aa5", "contradictions_surfaced": 0, "contradictions_total": 1, "contradictions_triggered": 0, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260443, "useless_questions_ratio": 0.0}
|
| 80 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_76724c", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260444, "useless_questions_ratio": 0.0}
|
| 81 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_767821", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260445, "useless_questions_ratio": 0.0}
|
| 82 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "motive_c0d166", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260446, "useless_questions_ratio": 0.0}
|
| 83 |
+
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "corporate_307934", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260447, "useless_questions_ratio": 0.0}
|
| 84 |
+
{"agent": "present_all", "auxiliary_reward": -0.35000000000000003, "avg_question_length": 0.0, "blind_evidence_count": 7, "case_id": "timeline_592816", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 7, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260448, "useless_questions_ratio": 0.0}
|
| 85 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "knowledge_b26824", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260449, "useless_questions_ratio": 0.0}
|
| 86 |
+
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "knowledge_697785", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260450, "useless_questions_ratio": 0.0}
|
| 87 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "timeline_81dafd", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260451, "useless_questions_ratio": 0.0}
|
| 88 |
+
{"agent": "present_all", "auxiliary_reward": -0.25, "avg_question_length": 0.0, "blind_evidence_count": 5, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 5, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260452, "useless_questions_ratio": 0.0}
|
| 89 |
+
{"agent": "present_all", "auxiliary_reward": -0.2, "avg_question_length": 0.0, "blind_evidence_count": 4, "case_id": "possession_dbb5fe", "contradictions_surfaced": 0, "contradictions_total": 2, "contradictions_triggered": 0, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260453, "useless_questions_ratio": 0.0}
|
| 90 |
+
{"agent": "present_all", "auxiliary_reward": -0.30000000000000004, "avg_question_length": 0.0, "blind_evidence_count": 6, "case_id": "alibi_a4666f", "contradictions_surfaced": 0, "contradictions_total": 3, "contradictions_triggered": 0, "difficulty": "hard", "evidence_presented": 6, "evidence_timing_successes": 0, "primary_reward": 0.0, "questions_used": 0, "reward": 0.0, "seed": 20260454, "useless_questions_ratio": 0.0}
|
| 91 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260425, "useless_questions_ratio": 0.0}
|
| 92 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260426, "useless_questions_ratio": 0.0}
|
| 93 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260427, "useless_questions_ratio": 0.0}
|
| 94 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0}
|
| 95 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260429, "useless_questions_ratio": 0.0}
|
| 96 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260430, "useless_questions_ratio": 0.0}
|
| 97 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0}
|
| 98 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0}
|
| 99 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260433, "useless_questions_ratio": 0.0}
|
| 100 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260434, "useless_questions_ratio": 0.0}
|
| 101 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0}
|
| 102 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260436, "useless_questions_ratio": 0.0}
|
| 103 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260437, "useless_questions_ratio": 0.0}
|
| 104 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260438, "useless_questions_ratio": 0.0}
|
| 105 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260439, "useless_questions_ratio": 0.0}
|
| 106 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260440, "useless_questions_ratio": 0.0}
|
| 107 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0}
|
| 108 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
| 109 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.4, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 1, "evidence_timing_successes": 1, "primary_reward": 1.0, "questions_used": 1, "reward": 0.8800000000000001, "seed": 20260443, "useless_questions_ratio": 0.0}
|
| 110 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260444, "useless_questions_ratio": 0.0}
|
| 111 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260445, "useless_questions_ratio": 0.0}
|
| 112 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260446, "useless_questions_ratio": 0.0}
|
| 113 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0}
|
| 114 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0}
|
| 115 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.5, "blind_evidence_count": 0, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260449, "useless_questions_ratio": 0.0}
|
| 116 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.6666666666666667, "blind_evidence_count": 0, "case_id": "knowledge_697785", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260450, "useless_questions_ratio": 0.0}
|
| 117 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.45, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 1, "primary_reward": 0.5, "questions_used": 2, "reward": 0.49000000000000005, "seed": 20260451, "useless_questions_ratio": 0.0}
|
| 118 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260452, "useless_questions_ratio": 0.0}
|
| 119 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 0.8, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260453, "useless_questions_ratio": 0.0}
|
| 120 |
+
{"agent": "scripted_oracle", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 2.3333333333333335, "blind_evidence_count": 0, "case_id": "alibi_a4666f", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260454, "useless_questions_ratio": 0.0}
|
| 121 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.2, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "timeline_255d67", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8400000000000001, "seed": 20260425, "useless_questions_ratio": 0.25}
|
| 122 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.75, "avg_question_length": 1.5, "blind_evidence_count": 1, "case_id": "knowledge_b28f8c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 2, "reward": 0.9500000000000001, "seed": 20260426, "useless_questions_ratio": 0.0}
|
| 123 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.2, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "workplace_c98377", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8400000000000001, "seed": 20260427, "useless_questions_ratio": 0.25}
|
| 124 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_66ff59", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260428, "useless_questions_ratio": 0.0}
|
| 125 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "timeline_19bb78", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8300000000000001, "seed": 20260429, "useless_questions_ratio": 0.5}
|
| 126 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.35, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "timeline_a97690", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 0.5, "questions_used": 4, "reward": 0.47000000000000003, "seed": 20260430, "useless_questions_ratio": 0.0}
|
| 127 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_67ffcd", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260431, "useless_questions_ratio": 0.0}
|
| 128 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.8, "avg_question_length": 3.0, "blind_evidence_count": 0, "case_id": "alibi_423bca", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 2, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 2, "reward": 0.9600000000000001, "seed": 20260432, "useless_questions_ratio": 0.0}
|
| 129 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7500000000000001, "avg_question_length": 1.25, "blind_evidence_count": 2, "case_id": "knowledge_960d07", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.9500000000000001, "seed": 20260433, "useless_questions_ratio": 0.25}
|
| 130 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.09999999999999998, "avg_question_length": 2.0, "blind_evidence_count": 3, "case_id": "alibi_e829c1", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8200000000000001, "seed": 20260434, "useless_questions_ratio": 0.75}
|
| 131 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "motive_85e25b", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260435, "useless_questions_ratio": 0.0}
|
| 132 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7000000000000001, "avg_question_length": 1.5, "blind_evidence_count": 2, "case_id": "knowledge_a599e3", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 2, "reward": 0.9400000000000001, "seed": 20260436, "useless_questions_ratio": 0.0}
|
| 133 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "motive_8bca20", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8300000000000001, "seed": 20260437, "useless_questions_ratio": 0.5}
|
| 134 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.65, "avg_question_length": 1.0, "blind_evidence_count": 2, "case_id": "corporate_6b1664", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.93, "seed": 20260438, "useless_questions_ratio": 0.25}
|
| 135 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.15000000000000002, "avg_question_length": 2.0, "blind_evidence_count": 3, "case_id": "alibi_a6c582", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8300000000000001, "seed": 20260439, "useless_questions_ratio": 0.5}
|
| 136 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.2, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "workplace_835476", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.8400000000000001, "seed": 20260440, "useless_questions_ratio": 0.25}
|
| 137 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_a079c5", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260441, "useless_questions_ratio": 0.0}
|
| 138 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "possession_9cc45d", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260442, "useless_questions_ratio": 0.0}
|
| 139 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.20000000000000004, "avg_question_length": 1.0, "blind_evidence_count": 2, "case_id": "possession_259aa5", "contradictions_surfaced": 1, "contradictions_total": 1, "contradictions_triggered": 1, "difficulty": "easy", "evidence_presented": 3, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 0.8400000000000001, "seed": 20260443, "useless_questions_ratio": 0.6666666666666666}
|
| 140 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.65, "avg_question_length": 1.0, "blind_evidence_count": 2, "case_id": "corporate_76724c", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.93, "seed": 20260444, "useless_questions_ratio": 0.25}
|
| 141 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.35, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "timeline_767821", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 0.5, "questions_used": 4, "reward": 0.47000000000000003, "seed": 20260445, "useless_questions_ratio": 0.0}
|
| 142 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7500000000000001, "avg_question_length": 1.0, "blind_evidence_count": 2, "case_id": "motive_c0d166", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.9500000000000001, "seed": 20260446, "useless_questions_ratio": 0.25}
|
| 143 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "corporate_307934", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260447, "useless_questions_ratio": 0.0}
|
| 144 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 1.2000000000000002, "avg_question_length": 1.0, "blind_evidence_count": 0, "case_id": "timeline_592816", "contradictions_surfaced": 3, "contradictions_total": 3, "contradictions_triggered": 3, "difficulty": "hard", "evidence_presented": 3, "evidence_timing_successes": 3, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 1.0, "seed": 20260448, "useless_questions_ratio": 0.0}
|
| 145 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7000000000000001, "avg_question_length": 1.5, "blind_evidence_count": 2, "case_id": "knowledge_b26824", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 2, "reward": 0.9400000000000001, "seed": 20260449, "useless_questions_ratio": 0.0}
|
| 146 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7500000000000001, "avg_question_length": 1.5, "blind_evidence_count": 2, "case_id": "knowledge_697785", "contradictions_surfaced": 2, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 0.6666666666666666, "questions_used": 4, "reward": 0.6833333333333333, "seed": 20260450, "useless_questions_ratio": 0.25}
|
| 147 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.35, "avg_question_length": 1.0, "blind_evidence_count": 3, "case_id": "timeline_81dafd", "contradictions_surfaced": 1, "contradictions_total": 2, "contradictions_triggered": 1, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 1, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 0.5, "questions_used": 4, "reward": 0.47000000000000003, "seed": 20260451, "useless_questions_ratio": 0.0}
|
| 148 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.65, "avg_question_length": 1.0, "blind_evidence_count": 2, "case_id": "corporate_8eb7d7", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 4, "reward": 0.93, "seed": 20260452, "useless_questions_ratio": 0.25}
|
| 149 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.7, "avg_question_length": 1.0, "blind_evidence_count": 1, "case_id": "possession_dbb5fe", "contradictions_surfaced": 2, "contradictions_total": 2, "contradictions_triggered": 2, "difficulty": "medium", "evidence_presented": 3, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 1.0, "questions_used": 3, "reward": 0.9400000000000001, "seed": 20260453, "useless_questions_ratio": 0.3333333333333333}
|
| 150 |
+
{"agent": "trained_qwen3_8b_qlora_sft_run4b", "auxiliary_reward": 0.6000000000000001, "avg_question_length": 2.0, "blind_evidence_count": 2, "case_id": "alibi_a4666f", "contradictions_surfaced": 2, "contradictions_total": 3, "contradictions_triggered": 2, "difficulty": "hard", "evidence_presented": 4, "evidence_timing_successes": 2, "invalid_tool_calls": 0, "model_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b", "primary_reward": 0.6666666666666666, "questions_used": 4, "reward": 0.6533333333333333, "seed": 20260454, "useless_questions_ratio": 0.5}
|
assets/trained_eval_run4b_8b_sft/eval/trained_eval_summary.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent": "keyword_spam",
|
| 4 |
+
"avg_evidence_timing": 0,
|
| 5 |
+
"avg_primary_reward": 0.0,
|
| 6 |
+
"avg_reward": 0.07300000000000001,
|
| 7 |
+
"avg_surface_rate": 0.0,
|
| 8 |
+
"avg_trigger_rate": 0.6777777777777778,
|
| 9 |
+
"avg_useless_ratio": 0.48000000000000004,
|
| 10 |
+
"episodes": 30
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"agent": "present_all",
|
| 14 |
+
"avg_evidence_timing": 0,
|
| 15 |
+
"avg_primary_reward": 0.0,
|
| 16 |
+
"avg_reward": 0.0,
|
| 17 |
+
"avg_surface_rate": 0.0,
|
| 18 |
+
"avg_trigger_rate": 0.0,
|
| 19 |
+
"avg_useless_ratio": 0.0,
|
| 20 |
+
"episodes": 30
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"agent": "random",
|
| 24 |
+
"avg_evidence_timing": 0,
|
| 25 |
+
"avg_primary_reward": 0.0,
|
| 26 |
+
"avg_reward": 0.0,
|
| 27 |
+
"avg_surface_rate": 0.0,
|
| 28 |
+
"avg_trigger_rate": 0.0,
|
| 29 |
+
"avg_useless_ratio": 1.0,
|
| 30 |
+
"episodes": 30
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"agent": "scripted_oracle",
|
| 34 |
+
"avg_evidence_timing": 1.9,
|
| 35 |
+
"avg_primary_reward": 0.95,
|
| 36 |
+
"avg_reward": 0.9023333333333334,
|
| 37 |
+
"avg_surface_rate": 0.95,
|
| 38 |
+
"avg_trigger_rate": 0.95,
|
| 39 |
+
"avg_useless_ratio": 0.0,
|
| 40 |
+
"episodes": 30
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"agent": "trained_qwen3_8b_qlora_sft_run4b",
|
| 44 |
+
"avg_evidence_timing": 1.8333333333333333,
|
| 45 |
+
"avg_primary_reward": 0.9277777777777778,
|
| 46 |
+
"avg_reward": 0.8598888888888889,
|
| 47 |
+
"avg_surface_rate": 0.9277777777777778,
|
| 48 |
+
"avg_trigger_rate": 0.9277777777777778,
|
| 49 |
+
"avg_useless_ratio": 0.2,
|
| 50 |
+
"episodes": 30
|
| 51 |
+
}
|
| 52 |
+
]
|
assets/trained_eval_run4b_8b_sft/eval/trained_eval_transcripts.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agent: trained_qwen3_8b_qlora_sft_run4b
|
| 2 |
+
|
| 3 |
+
## timeline_255d67 (easy)
|
| 4 |
+
|
| 5 |
+
Esme Dubois is accused of assault at the public library on Walnut Street on May 28th. The victim claims the assault occurred at 11:45 PM. The defendant claims they were elsewhere at that time.
|
| 6 |
+
|
| 7 |
+
### Transcript
|
| 8 |
+
- Step 1 `ask_question`: time?
|
| 9 |
+
- Witness: The assault happened at 11:00 PM, not 11:45 PM.
|
| 10 |
+
- Triggered: c1_time_of_assault; Surfaced: none
|
| 11 |
+
- Step 2 `present_evidence`: surveillance_timestamp
|
| 12 |
+
- Witness: [Witness stammers] I... I'm not sure what to say.
|
| 13 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 14 |
+
- Step 3 `ask_question`: where?
|
| 15 |
+
- Witness: I don't recall.
|
| 16 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 17 |
+
- Step 4 `present_evidence`: phone_records
|
| 18 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 19 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 20 |
+
- Step 5 `ask_question`: motive?
|
| 21 |
+
- Witness: I had no reason to hurt anyone.
|
| 22 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 23 |
+
- Step 6 `present_evidence`: motive_email
|
| 24 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 25 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 26 |
+
- Step 7 `ask_question`: alibi?
|
| 27 |
+
- Witness: I was at home at 11:00 PM. I have witnesses.
|
| 28 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 29 |
+
- Step 8 `present_evidence`: witness_alibi
|
| 30 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 31 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 32 |
+
- Step 9 `rest_case`: rest_case
|
| 33 |
+
- Witness: [Counsel rests.]
|
| 34 |
+
- Triggered: c1_time_of_assault; Surfaced: c1_time_of_assault
|
| 35 |
+
|
| 36 |
+
### Reward
|
| 37 |
+
```json
|
| 38 |
+
{"primary_reward": 1.0, "auxiliary_reward_raw": 0.2, "total_reward": 0.8400000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 3.0, "duplicate_questions": 0.0, "irrelevant_questions": 1.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.25, "avg_question_length": 1.0}
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
# Agent: trained_qwen3_8b_qlora_sft_run4b
|
| 44 |
+
|
| 45 |
+
## knowledge_b28f8c (medium)
|
| 46 |
+
|
| 47 |
+
Priya Sharma is charged with embezzlement from the diner at the corner of 9th and Broad. The state alleges Priya Sharma systematically diverted funds in coordination with the manager, Kavita Iyer, between December 5th and the present. The defendant denies knowing Kavita Iyer.
|
| 48 |
+
|
| 49 |
+
### Transcript
|
| 50 |
+
- Step 1 `ask_question`: know?
|
| 51 |
+
- Witness: I have never met Kavita Iyer in my life.
|
| 52 |
+
- Triggered: c1_know_victim; Surfaced: none
|
| 53 |
+
- Step 2 `present_evidence`: email_chain
|
| 54 |
+
- Witness: [Witness stammers] I... I'm not sure what to say.
|
| 55 |
+
- Triggered: c1_know_victim; Surfaced: c1_know_victim
|
| 56 |
+
- Step 3 `ask_question`: been to?
|
| 57 |
+
- Witness: I've never set foot in the diner at the corner of 9th and Broad.
|
| 58 |
+
- Triggered: c1_know_victim, c2_place_visit; Surfaced: c1_know_victim
|
| 59 |
+
- Step 4 `present_evidence`: visitor_log
|
| 60 |
+
- Witness: [Witness stammers] I... I'm not sure what to say.
|
| 61 |
+
- Triggered: c1_know_victim, c2_place_visit; Surfaced: c1_know_victim, c2_place_visit
|
| 62 |
+
- Step 5 `present_evidence`: meeting_notes
|
| 63 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 64 |
+
- Triggered: c1_know_victim, c2_place_visit; Surfaced: c1_know_victim, c2_place_visit
|
| 65 |
+
- Step 6 `rest_case`: rest_case
|
| 66 |
+
- Witness: [Counsel rests.]
|
| 67 |
+
- Triggered: c1_know_victim, c2_place_visit; Surfaced: c1_know_victim, c2_place_visit
|
| 68 |
+
|
| 69 |
+
### Reward
|
| 70 |
+
```json
|
| 71 |
+
{"primary_reward": 1.0, "auxiliary_reward_raw": 0.75, "total_reward": 0.9500000000000001, "contradictions_total": 2.0, "contradictions_triggered": 2.0, "contradictions_surfaced": 2.0, "keyword_questions": 2.0, "evidence_timing_successes": 2.0, "blind_evidence_count": 1.0, "duplicate_questions": 0.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.0, "avg_question_length": 1.5}
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
# Agent: trained_qwen3_8b_qlora_sft_run4b
|
| 77 |
+
|
| 78 |
+
## workplace_c98377 (easy)
|
| 79 |
+
|
| 80 |
+
Priya Sharma is questioned in an HR investigation after Ravi Joshi reported retaliation at the HR office on March 14th. The manager denies sending threatening messages or being nearby.
|
| 81 |
+
|
| 82 |
+
### Transcript
|
| 83 |
+
- Step 1 `ask_question`: message?
|
| 84 |
+
- Witness: I never sent Ravi Joshi any threatening message.
|
| 85 |
+
- Triggered: c1_message_denial; Surfaced: none
|
| 86 |
+
- Step 2 `present_evidence`: message_export
|
| 87 |
+
- Witness: [Witness stammers] I... I'm not sure what to say.
|
| 88 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 89 |
+
- Step 3 `ask_question`: where?
|
| 90 |
+
- Witness: I was not near the HR office that day.
|
| 91 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 92 |
+
- Step 4 `present_evidence`: badge_scan
|
| 93 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 94 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 95 |
+
- Step 5 `ask_question`: warning?
|
| 96 |
+
- Witness: HR never warned me about retaliation.
|
| 97 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 98 |
+
- Step 6 `present_evidence`: hr_warning_memo
|
| 99 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 100 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 101 |
+
- Step 7 `ask_question`: where?
|
| 102 |
+
- Witness: You've already asked that question.
|
| 103 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 104 |
+
- Step 8 `present_evidence`: security_clip
|
| 105 |
+
- Witness: [Witness] I have no comment on that exhibit.
|
| 106 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 107 |
+
- Step 9 `rest_case`: rest_case
|
| 108 |
+
- Witness: [Counsel rests.]
|
| 109 |
+
- Triggered: c1_message_denial; Surfaced: c1_message_denial
|
| 110 |
+
|
| 111 |
+
### Reward
|
| 112 |
+
```json
|
| 113 |
+
{"primary_reward": 1.0, "auxiliary_reward_raw": 0.2, "total_reward": 0.8400000000000001, "contradictions_total": 1.0, "contradictions_triggered": 1.0, "contradictions_surfaced": 1.0, "keyword_questions": 1.0, "evidence_timing_successes": 1.0, "blind_evidence_count": 3.0, "duplicate_questions": 1.0, "irrelevant_questions": 0.0, "inadmissible_actions": 0.0, "useless_questions_ratio": 0.25, "avg_question_length": 1.0}
|
| 114 |
+
```
|
assets/trained_eval_run4b_8b_sft/training_summary.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"artifact_repo": "heavycoderhh/counsel-env-qwen3-8b-qlora-sft-run4b",
|
| 3 |
+
"base_model": "Qwen/Qwen3-8B",
|
| 4 |
+
"gradient_accumulation_steps": 4,
|
| 5 |
+
"include_rest_rows": false,
|
| 6 |
+
"lora_alpha": 32,
|
| 7 |
+
"lora_dropout": 0.05,
|
| 8 |
+
"lora_r": 16,
|
| 9 |
+
"max_sft_length": 1536,
|
| 10 |
+
"metrics": {
|
| 11 |
+
"epoch": 0.6027397260273972,
|
| 12 |
+
"total_flos": 3.105680873988096e+16,
|
| 13 |
+
"train_loss": 0.05649940988699779,
|
| 14 |
+
"train_runtime": 1287.6923,
|
| 15 |
+
"train_samples_per_second": 0.683,
|
| 16 |
+
"train_steps_per_second": 0.171
|
| 17 |
+
},
|
| 18 |
+
"recipe": "qwen3_8b_qlora_oracle_sft",
|
| 19 |
+
"sft_case_count": 480,
|
| 20 |
+
"sft_epochs": 1.0,
|
| 21 |
+
"sft_learning_rate": 0.0001,
|
| 22 |
+
"sft_max_steps": 220,
|
| 23 |
+
"sft_row_count": 1460,
|
| 24 |
+
"space_repo": "heavycoderhh/counsel-env"
|
| 25 |
+
}
|