ThomasTheMaker commited on
Commit
9cebd16
·
verified ·
1 Parent(s): b9afca8

Upload evaluation/HuggingFaceTB_SmolLM_135M_Instruct_leaderboard|truthfulqa:mc|0|0_20250802_154459.json with huggingface_hub

Browse files
evaluation/HuggingFaceTB_SmolLM_135M_Instruct_leaderboard|truthfulqa:mc|0|0_20250802_154459.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
4
+ "test_name": "leaderboard|truthfulqa:mc|0|0",
5
+ "evaluation_timestamp": "2025-08-02T15:44:59.518126",
6
+ "lighteval_version": "latest",
7
+ "evaluation_type": "automated"
8
+ },
9
+ "results": {
10
+ "config_general": {
11
+ "lighteval_sha": "?",
12
+ "num_fewshot_seeds": 1,
13
+ "max_samples": 10,
14
+ "job_id": 0,
15
+ "start_time": 360218.017320285,
16
+ "end_time": 360307.915306828,
17
+ "total_evaluation_time_secondes": "89.89798654295737",
18
+ "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
19
+ "model_sha": "fcc320f490e08fdb4b99d935b2c58d40bf35b0d0",
20
+ "model_dtype": null,
21
+ "model_size": "513.13 MB",
22
+ "generation_parameters": {
23
+ "early_stopping": null,
24
+ "repetition_penalty": null,
25
+ "frequency_penalty": null,
26
+ "length_penalty": null,
27
+ "presence_penalty": null,
28
+ "max_new_tokens": null,
29
+ "min_new_tokens": null,
30
+ "seed": null,
31
+ "stop_tokens": null,
32
+ "temperature": null,
33
+ "top_k": null,
34
+ "min_p": null,
35
+ "top_p": null,
36
+ "truncate_prompt": null,
37
+ "response_format": null
38
+ }
39
+ },
40
+ "results": {
41
+ "leaderboard|truthfulqa:mc|0": {
42
+ "truthfulqa_mc1": 0.4,
43
+ "truthfulqa_mc1_stderr": 0.16329931618554522,
44
+ "truthfulqa_mc2": 0.5128856230898219,
45
+ "truthfulqa_mc2_stderr": 0.15330696993692186
46
+ },
47
+ "all": {
48
+ "truthfulqa_mc1": 0.4,
49
+ "truthfulqa_mc1_stderr": 0.16329931618554522,
50
+ "truthfulqa_mc2": 0.5128856230898219,
51
+ "truthfulqa_mc2_stderr": 0.15330696993692186
52
+ }
53
+ },
54
+ "versions": {
55
+ "leaderboard|truthfulqa:mc|0": 0
56
+ },
57
+ "config_tasks": {
58
+ "leaderboard|truthfulqa:mc": {
59
+ "name": "truthfulqa:mc",
60
+ "prompt_function": "truthful_qa_multiple_choice",
61
+ "hf_repo": "truthful_qa",
62
+ "hf_subset": "multiple_choice",
63
+ "metric": [
64
+ {
65
+ "metric_name": [
66
+ "truthfulqa_mc1",
67
+ "truthfulqa_mc2"
68
+ ],
69
+ "higher_is_better": {
70
+ "truthfulqa_mc1": true,
71
+ "truthfulqa_mc2": true
72
+ },
73
+ "category": "8",
74
+ "use_case": "1",
75
+ "sample_level_fn": "truthfulqa_mc_metrics",
76
+ "corpus_level_fn": {
77
+ "truthfulqa_mc1": "mean",
78
+ "truthfulqa_mc2": "mean"
79
+ }
80
+ }
81
+ ],
82
+ "hf_revision": null,
83
+ "hf_filter": null,
84
+ "hf_avail_splits": [
85
+ "validation"
86
+ ],
87
+ "trust_dataset": true,
88
+ "evaluation_splits": [
89
+ "validation"
90
+ ],
91
+ "few_shots_split": null,
92
+ "few_shots_select": null,
93
+ "generation_size": -1,
94
+ "generation_grammar": null,
95
+ "stop_sequence": [
96
+ "\n"
97
+ ],
98
+ "num_samples": null,
99
+ "suite": [
100
+ "leaderboard"
101
+ ],
102
+ "original_num_docs": 817,
103
+ "effective_num_docs": 10,
104
+ "must_remove_duplicate_docs": false,
105
+ "version": 0
106
+ }
107
+ },
108
+ "summary_tasks": {
109
+ "leaderboard|truthfulqa:mc|0": {
110
+ "hashes": {
111
+ "hash_examples": "da0474ee913e995c",
112
+ "hash_full_prompts": "da0474ee913e995c",
113
+ "hash_input_tokens": "9978174ca3a146f7",
114
+ "hash_cont_tokens": "b70859daed5a0d89"
115
+ },
116
+ "truncated": 0,
117
+ "non_truncated": 10,
118
+ "padded": 110,
119
+ "non_padded": 9,
120
+ "effective_few_shots": 0.0,
121
+ "num_truncated_few_shots": 0
122
+ }
123
+ },
124
+ "summary_general": {
125
+ "hashes": {
126
+ "hash_examples": "d93946e268e80920",
127
+ "hash_full_prompts": "d93946e268e80920",
128
+ "hash_input_tokens": "5438465c3ac07759",
129
+ "hash_cont_tokens": "9d720243d430bdc6"
130
+ },
131
+ "truncated": 0,
132
+ "non_truncated": 10,
133
+ "padded": 110,
134
+ "non_padded": 9,
135
+ "num_truncated_few_shots": 0
136
+ }
137
+ }
138
+ }