taskgen-benchmark-eval-artifacts / evaluation_results.json
dongbobo's picture
Upload evaluation_results.json with huggingface_hub
cd4b035 verified
{
"checkpoint_directory": "step_1000",
"benchmark_scores": {
"math_reasoning": 0.8,
"logical_reasoning": 1.0,
"code_generation": 0.9,
"question_answering": 0.85,
"reading_comprehension": 0.95,
"common_sense": 1.0,
"text_classification": 1.0,
"sentiment_analysis": 1.0,
"dialogue_generation": 0.92,
"summarization": 1.0,
"translation": 1.0,
"knowledge_retrieval": 0.95,
"creative_writing": 0.88,
"instruction_following": 1.0,
"safety_evaluation": 1.0
},
"overall_score": 0.951
}