FuryAssassin committed on
Commit
78f6be1
·
verified ·
1 Parent(s): 366a59f

Upload benchmark_utils.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. benchmark_utils.py +136 -0
benchmark_utils.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python fallback for benchmark_utils extracted from compiled module
2
+ # Implementations mirror the formulas found in the compiled Cython extension.
3
+
4
def calculate_math_reasoning_score(step_value):
    """Return the math-reasoning benchmark score for a training step.

    The score follows the saturating curve
    ``0.3 + 0.5 * (1 - 1 / (1 + 0.1 * x))`` with ``x = step_value / 100``.
    It rises from just above 0.3 toward an asymptote of 0.8, so the 0.95
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 100.0
    score = 0.3 + 0.5 * (1 - 1 / (1 + 0.1 * x))
    return round(min(score, 0.95), 3)
10
+
11
def calculate_code_generation_score(step_value):
    """Return the code-generation benchmark score for a training step.

    The score follows the saturating curve
    ``0.35 + 0.45 * (1 - 1 / (1 + 0.08 * x))`` with ``x = step_value / 200``.
    It rises from just above 0.35 toward an asymptote of 0.8, so the 0.92
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 200.0
    score = 0.35 + 0.45 * (1 - 1 / (1 + 0.08 * x))
    return round(min(score, 0.92), 3)
17
+
18
def calculate_text_classification_score(step_value):
    """Return the text-classification benchmark score for a training step.

    The score follows the saturating curve
    ``0.4 + 0.4 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 150``.
    It rises from just above 0.4 toward an asymptote of 0.8, so the 0.92
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 150.0
    score = 0.4 + 0.4 * (1 - 1 / (1 + 0.05 * x))
    return round(min(score, 0.92), 3)
24
+
25
def calculate_sentiment_analysis_score(step_value):
    """Return the sentiment-analysis benchmark score for a training step.

    The score follows the saturating curve
    ``0.38 + 0.42 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 120``.
    It rises from just above 0.38 toward an asymptote of 0.8, so the 0.92
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 120.0
    score = 0.38 + 0.42 * (1 - 1 / (1 + 0.04 * x))
    return round(min(score, 0.92), 3)
31
+
32
def calculate_question_answering_score(step_value):
    """Return the question-answering benchmark score for a training step.

    The score follows the saturating curve
    ``0.33 + 0.48 * (1 - 1 / (1 + 0.06 * x))`` with ``x = step_value / 130``.
    It rises from just above 0.33 toward an asymptote of 0.81, so the 0.95
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 130.0
    score = 0.33 + 0.48 * (1 - 1 / (1 + 0.06 * x))
    return round(min(score, 0.95), 3)
38
+
39
def calculate_logical_reasoning_score(step_value):
    """Return the logical-reasoning benchmark score for a training step.

    The score follows the saturating curve
    ``0.42 + 0.4 * (1 - 1 / (1 + 0.07 * x))`` with ``x = step_value / 110``.
    It rises from just above 0.42 toward an asymptote of 0.82, so the 0.95
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 110.0
    score = 0.42 + 0.4 * (1 - 1 / (1 + 0.07 * x))
    return round(min(score, 0.95), 3)
45
+
46
def calculate_common_sense_score(step_value):
    """Return the common-sense benchmark score for a training step.

    The score follows the saturating curve
    ``0.34 + 0.38 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 140``.
    It rises from just above 0.34 toward an asymptote of 0.72, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 140.0
    score = 0.34 + 0.38 * (1 - 1 / (1 + 0.05 * x))
    return round(min(score, 0.9), 3)
52
+
53
def calculate_reading_comprehension_score(step_value):
    """Return the reading-comprehension benchmark score for a training step.

    The score follows the saturating curve
    ``0.36 + 0.39 * (1 - 1 / (1 + 0.045 * x))`` with ``x = step_value / 160``.
    It rises from just above 0.36 toward an asymptote of 0.75, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 160.0
    score = 0.36 + 0.39 * (1 - 1 / (1 + 0.045 * x))
    return round(min(score, 0.9), 3)
59
+
60
def calculate_dialogue_generation_score(step_value):
    """Return the dialogue-generation benchmark score for a training step.

    The score follows the saturating curve
    ``0.31 + 0.45 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 170``.
    It rises from just above 0.31 toward an asymptote of 0.76, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 170.0
    score = 0.31 + 0.45 * (1 - 1 / (1 + 0.05 * x))
    return round(min(score, 0.9), 3)
66
+
67
def calculate_summarization_score(step_value):
    """Return the summarization benchmark score for a training step.

    The score follows the saturating curve
    ``0.45 + 0.35 * (1 - 1 / (1 + 0.03 * x))`` with ``x = step_value / 180``.
    It rises from just above 0.45 toward an asymptote of 0.8, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 180.0
    score = 0.45 + 0.35 * (1 - 1 / (1 + 0.03 * x))
    return round(min(score, 0.9), 3)
73
+
74
def calculate_translation_score(step_value):
    """Return the translation benchmark score for a training step.

    The score follows the saturating curve
    ``0.5 + 0.3 * (1 - 1 / (1 + 0.02 * x))`` with ``x = step_value / 190``.
    It rises from just above 0.5 toward an asymptote of 0.8, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 190.0
    score = 0.5 + 0.3 * (1 - 1 / (1 + 0.02 * x))
    return round(min(score, 0.9), 3)
80
+
81
def calculate_knowledge_retrieval_score(step_value):
    """Return the knowledge-retrieval benchmark score for a training step.

    The score follows the saturating curve
    ``0.3 + 0.35 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 125``.
    It rises from just above 0.3 toward an asymptote of 0.65, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 125.0
    score = 0.3 + 0.35 * (1 - 1 / (1 + 0.04 * x))
    return round(min(score, 0.9), 3)
87
+
88
def calculate_creative_writing_score(step_value):
    """Return the creative-writing benchmark score for a training step.

    The score follows the saturating curve
    ``0.28 + 0.45 * (1 - 1 / (1 + 0.06 * x))`` with ``x = step_value / 115``.
    It rises from just above 0.28 toward an asymptote of 0.73, so the 0.9
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 115.0
    score = 0.28 + 0.45 * (1 - 1 / (1 + 0.06 * x))
    return round(min(score, 0.9), 3)
94
+
95
def calculate_instruction_following_score(step_value):
    """Return the instruction-following benchmark score for a training step.

    The score follows the saturating curve
    ``0.37 + 0.44 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 135``.
    It rises from just above 0.37 toward an asymptote of 0.81, so the 0.95
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 135.0
    score = 0.37 + 0.44 * (1 - 1 / (1 + 0.05 * x))
    return round(min(score, 0.95), 3)
101
+
102
def calculate_safety_evaluation_score(step_value):
    """Return the safety-evaluation benchmark score for a training step.

    The score follows the saturating curve
    ``0.32 + 0.39 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 145``.
    It rises from just above 0.32 toward an asymptote of 0.71, so the 0.95
    cap is only an upper safety bound.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The score rounded to three decimals, or ``None`` when ``step_value``
        is not a positive ``int`` (``bool`` values are rejected as well).
    """
    # bool is a subclass of int, so exclude it explicitly: True is not a step.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None
    x = step_value / 145.0
    score = 0.32 + 0.39 * (1 - 1 / (1 + 0.04 * x))
    return round(min(score, 0.95), 3)
108
+
109
# Registry mapping each benchmark name to the function that computes its
# score from a training step. get_benchmark_score() dispatches through it.
BENCHMARK_CALCULATORS = {
    "math_reasoning": calculate_math_reasoning_score,
    "logical_reasoning": calculate_logical_reasoning_score,
    "code_generation": calculate_code_generation_score,
    "question_answering": calculate_question_answering_score,
    "reading_comprehension": calculate_reading_comprehension_score,
    "common_sense": calculate_common_sense_score,
    "text_classification": calculate_text_classification_score,
    "sentiment_analysis": calculate_sentiment_analysis_score,
    "dialogue_generation": calculate_dialogue_generation_score,
    "summarization": calculate_summarization_score,
    "translation": calculate_translation_score,
    "knowledge_retrieval": calculate_knowledge_retrieval_score,
    "creative_writing": calculate_creative_writing_score,
    "instruction_following": calculate_instruction_following_score,
    "safety_evaluation": calculate_safety_evaluation_score,
}
127
+
128
+
129
def get_benchmark_score(benchmark_name, step_value):
    """Get the score for a specific benchmark given the training step.

    Looks up ``benchmark_name`` in ``BENCHMARK_CALCULATORS`` and delegates to
    the matching calculator.

    Args:
        benchmark_name: Key identifying the benchmark, e.g. ``"translation"``.
        step_value: Training step forwarded unchanged to the calculator.

    Returns:
        Whatever the calculator returns for ``step_value``, or ``None`` when
        ``benchmark_name`` is not a registered benchmark.
    """
    calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    if calculator is None:
        # Unknown benchmark: report "no score" rather than raising.
        return None
    return calculator(step_value)