| { | |
| "benchmarks": [ | |
| "code_generation", | |
| "common_sense", | |
| "creative_writing", | |
| "dialogue_generation", | |
| "instruction_following", | |
| "knowledge_retrieval", | |
| "logical_reasoning", | |
| "math_reasoning", | |
| "question_answering", | |
| "reading_comprehension", | |
| "safety_evaluation", | |
| "sentiment_analysis", | |
| "summarization", | |
| "text_classification", | |
| "translation" | |
| ], | |
| "results": [ | |
| { | |
| "step": 100, | |
| "eval_accuracy": 0.705, | |
| "benchmarks": { | |
| "common_sense": 0.703, | |
| "creative_writing": 0.603, | |
| "instruction_following": 0.753, | |
| "knowledge_retrieval": 0.673, | |
| "logical_reasoning": 0.723, | |
| "math_reasoning": 0.623, | |
| "question_answering": 0.613, | |
| "reading_comprehension": 0.683, | |
| "safety_evaluation": 0.733, | |
| "sentiment_analysis": 0.803, | |
| "summarization": 0.763, | |
| "translation": 0.803, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 200, | |
| "eval_accuracy": 0.709, | |
| "benchmarks": { | |
| "common_sense": 0.707, | |
| "creative_writing": 0.607, | |
| "instruction_following": 0.757, | |
| "knowledge_retrieval": 0.677, | |
| "logical_reasoning": 0.727, | |
| "math_reasoning": 0.627, | |
| "question_answering": 0.617, | |
| "reading_comprehension": 0.687, | |
| "safety_evaluation": 0.737, | |
| "sentiment_analysis": 0.807, | |
| "summarization": 0.767, | |
| "translation": 0.807, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 300, | |
| "eval_accuracy": 0.711, | |
| "benchmarks": { | |
| "common_sense": 0.709, | |
| "creative_writing": 0.609, | |
| "instruction_following": 0.759, | |
| "knowledge_retrieval": 0.679, | |
| "logical_reasoning": 0.729, | |
| "math_reasoning": 0.629, | |
| "question_answering": 0.619, | |
| "reading_comprehension": 0.689, | |
| "safety_evaluation": 0.739, | |
| "sentiment_analysis": 0.809, | |
| "summarization": 0.769, | |
| "translation": 0.809, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 400, | |
| "eval_accuracy": 0.712, | |
| "benchmarks": { | |
| "common_sense": 0.71, | |
| "creative_writing": 0.61, | |
| "instruction_following": 0.76, | |
| "knowledge_retrieval": 0.68, | |
| "logical_reasoning": 0.73, | |
| "math_reasoning": 0.63, | |
| "question_answering": 0.62, | |
| "reading_comprehension": 0.69, | |
| "safety_evaluation": 0.74, | |
| "sentiment_analysis": 0.81, | |
| "summarization": 0.77, | |
| "translation": 0.81, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 500, | |
| "eval_accuracy": 0.713, | |
| "benchmarks": { | |
| "common_sense": 0.711, | |
| "creative_writing": 0.611, | |
| "instruction_following": 0.761, | |
| "knowledge_retrieval": 0.681, | |
| "logical_reasoning": 0.731, | |
| "math_reasoning": 0.631, | |
| "question_answering": 0.621, | |
| "reading_comprehension": 0.691, | |
| "safety_evaluation": 0.741, | |
| "sentiment_analysis": 0.811, | |
| "summarization": 0.771, | |
| "translation": 0.811, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 600, | |
| "eval_accuracy": 0.714, | |
| "benchmarks": { | |
| "common_sense": 0.712, | |
| "creative_writing": 0.612, | |
| "instruction_following": 0.762, | |
| "knowledge_retrieval": 0.682, | |
| "logical_reasoning": 0.732, | |
| "math_reasoning": 0.632, | |
| "question_answering": 0.622, | |
| "reading_comprehension": 0.692, | |
| "safety_evaluation": 0.742, | |
| "sentiment_analysis": 0.812, | |
| "summarization": 0.772, | |
| "translation": 0.812, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 700, | |
| "eval_accuracy": 0.715, | |
| "benchmarks": { | |
| "common_sense": 0.713, | |
| "creative_writing": 0.613, | |
| "instruction_following": 0.763, | |
| "knowledge_retrieval": 0.683, | |
| "logical_reasoning": 0.733, | |
| "math_reasoning": 0.633, | |
| "question_answering": 0.623, | |
| "reading_comprehension": 0.693, | |
| "safety_evaluation": 0.743, | |
| "sentiment_analysis": 0.813, | |
| "summarization": 0.773, | |
| "translation": 0.813, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 800, | |
| "eval_accuracy": 0.715, | |
| "benchmarks": { | |
| "common_sense": 0.713, | |
| "creative_writing": 0.613, | |
| "instruction_following": 0.763, | |
| "knowledge_retrieval": 0.683, | |
| "logical_reasoning": 0.733, | |
| "math_reasoning": 0.633, | |
| "question_answering": 0.623, | |
| "reading_comprehension": 0.693, | |
| "safety_evaluation": 0.743, | |
| "sentiment_analysis": 0.813, | |
| "summarization": 0.773, | |
| "translation": 0.813, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 900, | |
| "eval_accuracy": 0.716, | |
| "benchmarks": { | |
| "common_sense": 0.714, | |
| "creative_writing": 0.614, | |
| "instruction_following": 0.764, | |
| "knowledge_retrieval": 0.684, | |
| "logical_reasoning": 0.734, | |
| "math_reasoning": 0.634, | |
| "question_answering": 0.624, | |
| "reading_comprehension": 0.694, | |
| "safety_evaluation": 0.744, | |
| "sentiment_analysis": 0.814, | |
| "summarization": 0.774, | |
| "translation": 0.814, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| }, | |
| { | |
| "step": 1000, | |
| "eval_accuracy": 0.717, | |
| "benchmarks": { | |
| "common_sense": 0.715, | |
| "creative_writing": 0.615, | |
| "instruction_following": 0.765, | |
| "knowledge_retrieval": 0.685, | |
| "logical_reasoning": 0.735, | |
| "math_reasoning": 0.635, | |
| "question_answering": 0.625, | |
| "reading_comprehension": 0.695, | |
| "safety_evaluation": 0.745, | |
| "sentiment_analysis": 0.815, | |
| "summarization": 0.775, | |
| "translation": 0.815, | |
| "code_generation": null, | |
| "dialogue_generation": null, | |
| "text_classification": null | |
| } | |
| } | |
| ] | |
| } |