| | |
| | |
| |
|
def calculate_math_reasoning_score(step_value):
    """Return the math-reasoning benchmark score for a training step.

    The score follows the saturating curve
    ``0.3 + 0.5 * (1 - 1 / (1 + 0.1 * x))`` with ``x = step_value / 100``,
    is capped at 0.95 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 100.0
    raw = 0.3 + 0.5 * (1 - 1 / (1 + 0.1 * scaled))
    return round(min(raw, 0.95), 3)
| |
|
def calculate_code_generation_score(step_value):
    """Return the code-generation benchmark score for a training step.

    The score follows the saturating curve
    ``0.35 + 0.45 * (1 - 1 / (1 + 0.08 * x))`` with ``x = step_value / 200``,
    is capped at 0.92 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 200.0
    raw = 0.35 + 0.45 * (1 - 1 / (1 + 0.08 * scaled))
    return round(min(raw, 0.92), 3)
| |
|
def calculate_text_classification_score(step_value):
    """Return the text-classification benchmark score for a training step.

    The score follows the saturating curve
    ``0.4 + 0.4 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 150``,
    is capped at 0.92 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 150.0
    raw = 0.4 + 0.4 * (1 - 1 / (1 + 0.05 * scaled))
    return round(min(raw, 0.92), 3)
| |
|
def calculate_sentiment_analysis_score(step_value):
    """Return the sentiment-analysis benchmark score for a training step.

    The score follows the saturating curve
    ``0.38 + 0.42 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 120``,
    is capped at 0.92 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 120.0
    raw = 0.38 + 0.42 * (1 - 1 / (1 + 0.04 * scaled))
    return round(min(raw, 0.92), 3)
| |
|
def calculate_question_answering_score(step_value):
    """Return the question-answering benchmark score for a training step.

    The score follows the saturating curve
    ``0.33 + 0.48 * (1 - 1 / (1 + 0.06 * x))`` with ``x = step_value / 130``,
    is capped at 0.95 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 130.0
    raw = 0.33 + 0.48 * (1 - 1 / (1 + 0.06 * scaled))
    return round(min(raw, 0.95), 3)
| |
|
def calculate_logical_reasoning_score(step_value):
    """Return the logical-reasoning benchmark score for a training step.

    The score follows the saturating curve
    ``0.42 + 0.4 * (1 - 1 / (1 + 0.07 * x))`` with ``x = step_value / 110``,
    is capped at 0.95 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 110.0
    raw = 0.42 + 0.4 * (1 - 1 / (1 + 0.07 * scaled))
    return round(min(raw, 0.95), 3)
| |
|
def calculate_common_sense_score(step_value):
    """Return the common-sense benchmark score for a training step.

    The score follows the saturating curve
    ``0.34 + 0.38 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 140``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 140.0
    raw = 0.34 + 0.38 * (1 - 1 / (1 + 0.05 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_reading_comprehension_score(step_value):
    """Return the reading-comprehension benchmark score for a training step.

    The score follows the saturating curve
    ``0.36 + 0.39 * (1 - 1 / (1 + 0.045 * x))`` with ``x = step_value / 160``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 160.0
    raw = 0.36 + 0.39 * (1 - 1 / (1 + 0.045 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_dialogue_generation_score(step_value):
    """Return the dialogue-generation benchmark score for a training step.

    The score follows the saturating curve
    ``0.31 + 0.45 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 170``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 170.0
    raw = 0.31 + 0.45 * (1 - 1 / (1 + 0.05 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_summarization_score(step_value):
    """Return the summarization benchmark score for a training step.

    The score follows the saturating curve
    ``0.45 + 0.35 * (1 - 1 / (1 + 0.03 * x))`` with ``x = step_value / 180``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 180.0
    raw = 0.45 + 0.35 * (1 - 1 / (1 + 0.03 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_translation_score(step_value):
    """Return the translation benchmark score for a training step.

    The score follows the saturating curve
    ``0.5 + 0.3 * (1 - 1 / (1 + 0.02 * x))`` with ``x = step_value / 190``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 190.0
    raw = 0.5 + 0.3 * (1 - 1 / (1 + 0.02 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_knowledge_retrieval_score(step_value):
    """Return the knowledge-retrieval benchmark score for a training step.

    The score follows the saturating curve
    ``0.3 + 0.35 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 125``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 125.0
    raw = 0.3 + 0.35 * (1 - 1 / (1 + 0.04 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_creative_writing_score(step_value):
    """Return the creative-writing benchmark score for a training step.

    The score follows the saturating curve
    ``0.28 + 0.45 * (1 - 1 / (1 + 0.06 * x))`` with ``x = step_value / 115``,
    is capped at 0.9 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 115.0
    raw = 0.28 + 0.45 * (1 - 1 / (1 + 0.06 * scaled))
    return round(min(raw, 0.9), 3)
| |
|
def calculate_instruction_following_score(step_value):
    """Return the instruction-following benchmark score for a training step.

    The score follows the saturating curve
    ``0.37 + 0.44 * (1 - 1 / (1 + 0.05 * x))`` with ``x = step_value / 135``,
    is capped at 0.95 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 135.0
    raw = 0.37 + 0.44 * (1 - 1 / (1 + 0.05 * scaled))
    return round(min(raw, 0.95), 3)
| |
|
def calculate_safety_evaluation_score(step_value):
    """Return the safety-evaluation benchmark score for a training step.

    The score follows the saturating curve
    ``0.32 + 0.39 * (1 - 1 / (1 + 0.04 * x))`` with ``x = step_value / 145``,
    is capped at 0.95 and rounded to three decimal places.

    Args:
        step_value: Training step; must be a positive ``int``.

    Returns:
        The rounded score as a ``float``, or ``None`` when ``step_value`` is
        not a positive integer.
    """
    # bool subclasses int, so True would otherwise be scored as step 1.
    if isinstance(step_value, bool) or not isinstance(step_value, int):
        return None
    if step_value <= 0:
        return None

    scaled = step_value / 145.0
    raw = 0.32 + 0.39 * (1 - 1 / (1 + 0.04 * scaled))
    return round(min(raw, 0.95), 3)
| |
|
| | |
# Registry mapping each benchmark name to the function that computes its
# score from a training step. get_benchmark_score() dispatches through it.
BENCHMARK_CALCULATORS = {
    "math_reasoning": calculate_math_reasoning_score,
    "logical_reasoning": calculate_logical_reasoning_score,
    "code_generation": calculate_code_generation_score,
    "question_answering": calculate_question_answering_score,
    "reading_comprehension": calculate_reading_comprehension_score,
    "common_sense": calculate_common_sense_score,
    "text_classification": calculate_text_classification_score,
    "sentiment_analysis": calculate_sentiment_analysis_score,
    "dialogue_generation": calculate_dialogue_generation_score,
    "summarization": calculate_summarization_score,
    "translation": calculate_translation_score,
    "knowledge_retrieval": calculate_knowledge_retrieval_score,
    "creative_writing": calculate_creative_writing_score,
    "instruction_following": calculate_instruction_following_score,
    "safety_evaluation": calculate_safety_evaluation_score,
}
| |
|
| |
|
def get_benchmark_score(benchmark_name, step_value):
    """Return the score for one benchmark at a given training step.

    Args:
        benchmark_name: Key into ``BENCHMARK_CALCULATORS``.
        step_value: Training step forwarded unchanged to the benchmark's
            calculator.

    Returns:
        Whatever the selected calculator returns, or ``None`` when
        ``benchmark_name`` is not a registered benchmark name.
    """
    # Every registered key is a string. Rejecting other types up front also
    # keeps unhashable names (e.g. a list) from raising TypeError in the
    # dictionary lookup, so invalid names consistently yield None.
    if not isinstance(benchmark_name, str):
        return None

    calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    if calculator is None:
        return None
    return calculator(step_value)
| |
|