ThomasTheMaker commited on
Commit
a9ca72e
·
verified ·
1 Parent(s): 40a1363

Upload evaluation/Qwen_Qwen3_0.6B_leaderboard|truthfulqa:mc|0|0_20250729_131709.json with huggingface_hub

Browse files
evaluation/Qwen_Qwen3_0.6B_leaderboard|truthfulqa:mc|0|0_20250729_131709.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "model_name": "Qwen/Qwen3-0.6B",
4
+ "test_name": "leaderboard|truthfulqa:mc|0|0",
5
+ "evaluation_timestamp": "2025-07-29T13:17:09.801306",
6
+ "lighteval_version": "latest",
7
+ "evaluation_type": "automated"
8
+ },
9
+ "results": {
10
+ "config_general": {
11
+ "lighteval_sha": "?",
12
+ "num_fewshot_seeds": 1,
13
+ "max_samples": 10,
14
+ "job_id": 0,
15
+ "start_time": 5765.815531393,
16
+ "end_time": 5838.737527639,
17
+ "total_evaluation_time_secondes": "72.92199624600016",
18
+ "model_name": "Qwen/Qwen3-0.6B",
19
+ "model_sha": "c1899de289a04d12100db370d81485cdf75e47ca",
20
+ "model_dtype": null,
21
+ "model_size": "2.22 GB",
22
+ "generation_parameters": {
23
+ "early_stopping": null,
24
+ "repetition_penalty": null,
25
+ "frequency_penalty": null,
26
+ "length_penalty": null,
27
+ "presence_penalty": null,
28
+ "max_new_tokens": null,
29
+ "min_new_tokens": null,
30
+ "seed": null,
31
+ "stop_tokens": null,
32
+ "temperature": null,
33
+ "top_k": null,
34
+ "min_p": null,
35
+ "top_p": null,
36
+ "truncate_prompt": null,
37
+ "response_format": null
38
+ }
39
+ },
40
+ "results": {
41
+ "leaderboard|truthfulqa:mc|0": {
42
+ "truthfulqa_mc1": 0.4,
43
+ "truthfulqa_mc1_stderr": 0.16329931618554522,
44
+ "truthfulqa_mc2": 0.38767522181090863,
45
+ "truthfulqa_mc2_stderr": 0.13021084864495217
46
+ },
47
+ "all": {
48
+ "truthfulqa_mc1": 0.4,
49
+ "truthfulqa_mc1_stderr": 0.16329931618554522,
50
+ "truthfulqa_mc2": 0.38767522181090863,
51
+ "truthfulqa_mc2_stderr": 0.13021084864495217
52
+ }
53
+ },
54
+ "versions": {
55
+ "leaderboard|truthfulqa:mc|0": 0
56
+ },
57
+ "config_tasks": {
58
+ "leaderboard|truthfulqa:mc": {
59
+ "name": "truthfulqa:mc",
60
+ "prompt_function": "truthful_qa_multiple_choice",
61
+ "hf_repo": "truthful_qa",
62
+ "hf_subset": "multiple_choice",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "truthfulqa_mc1",
67
+ "truthfulqa_mc2"
68
+ ],
69
+ "higher_is_better": {
70
+ "truthfulqa_mc1": true,
71
+ "truthfulqa_mc2": true
72
+ },
73
+ "category": "8",
74
+ "use_case": "1",
75
+ "sample_level_fn": "truthfulqa_mc_metrics",
76
+ "corpus_level_fn": {
77
+ "truthfulqa_mc1": "mean",
78
+ "truthfulqa_mc2": "mean"
79
+ }
80
+ }
81
+ ],
82
+ "hf_revision": null,
83
+ "hf_filter": null,
84
+ "hf_avail_splits": [
85
+ "validation"
86
+ ],
87
+ "trust_dataset": true,
88
+ "evaluation_splits": [
89
+ "validation"
90
+ ],
91
+ "few_shots_split": null,
92
+ "few_shots_select": null,
93
+ "generation_size": -1,
94
+ "generation_grammar": null,
95
+ "stop_sequence": [
96
+ "\n"
97
+ ],
98
+ "num_samples": null,
99
+ "suite": [
100
+ "leaderboard"
101
+ ],
102
+ "original_num_docs": 817,
103
+ "effective_num_docs": 10,
104
+ "must_remove_duplicate_docs": false,
105
+ "version": 0
106
+ }
107
+ },
108
+ "summary_tasks": {
109
+ "leaderboard|truthfulqa:mc|0": {
110
+ "hashes": {
111
+ "hash_examples": "da0474ee913e995c",
112
+ "hash_full_prompts": "da0474ee913e995c",
113
+ "hash_input_tokens": "e91edf9d841d2695",
114
+ "hash_cont_tokens": "6c9c980152091c7a"
115
+ },
116
+ "truncated": 0,
117
+ "non_truncated": 10,
118
+ "padded": 101,
119
+ "non_padded": 18,
120
+ "effective_few_shots": 0.0,
121
+ "num_truncated_few_shots": 0
122
+ }
123
+ },
124
+ "summary_general": {
125
+ "hashes": {
126
+ "hash_examples": "d93946e268e80920",
127
+ "hash_full_prompts": "d93946e268e80920",
128
+ "hash_input_tokens": "2653f11512a69452",
129
+ "hash_cont_tokens": "899b57bbe39db181"
130
+ },
131
+ "truncated": 0,
132
+ "non_truncated": 10,
133
+ "padded": 101,
134
+ "non_padded": 18,
135
+ "num_truncated_few_shots": 0
136
+ }
137
+ }
138
+ }