AMA-bench-Leaderboard / data /qa_distribution.json
NorahYujieZhao
refine the model scope
fb8b1d9
{
"metadata": {
"description": "Distribution of QA pairs across domains and problem types",
"total_episodes": 208,
"total_qa_pairs": 2496,
"problem_type_mapping": {
"A": "Recall",
"B": "Causal Inference",
"C": "State Updating",
"D": "State Abstraction"
}
},
"overall_distribution": {
"total_qa_pairs": 2496,
"problem_types": {
"A": {
"name": "Recall",
"count": 839,
"ratio": 0.336058
},
"B": {
"name": "Causal Inference",
"count": 596,
"ratio": 0.238782
},
"C": {
"name": "State Updating",
"count": 647,
"ratio": 0.259296
},
"D": {
"name": "State Abstraction",
"count": 414,
"ratio": 0.165865
}
}
},
"domain_distribution": {
"TEXT2SQL": {
"total_episodes": 51,
"episode_ratio": 0.245192,
"total_qa_pairs": 612,
"qa_ratio": 0.245192,
"problem_types": {
"A": {
"name": "Recall",
"count": 223,
"ratio_in_domain": 0.364379,
"ratio_overall": 0.089344
},
"B": {
"name": "Causal Inference",
"count": 153,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.061298
},
"C": {
"name": "State Updating",
"count": 134,
"ratio_in_domain": 0.218954,
"ratio_overall": 0.053686
},
"D": {
"name": "State Abstraction",
"count": 102,
"ratio_in_domain": 0.166667,
"ratio_overall": 0.040865
}
}
},
"SOFTWARE": {
"total_episodes": 36,
"episode_ratio": 0.173077,
"total_qa_pairs": 432,
"qa_ratio": 0.173077,
"problem_types": {
"A": {
"name": "Recall",
"count": 212,
"ratio_in_domain": 0.490741,
"ratio_overall": 0.084936
},
"B": {
"name": "Causal Inference",
"count": 75,
"ratio_in_domain": 0.173611,
"ratio_overall": 0.030048
},
"C": {
"name": "State Updating",
"count": 73,
"ratio_in_domain": 0.168981,
"ratio_overall": 0.029247
},
"D": {
"name": "State Abstraction",
"count": 72,
"ratio_in_domain": 0.166667,
"ratio_overall": 0.028846
}
}
},
"WEB": {
"total_episodes": 31,
"episode_ratio": 0.149038,
"total_qa_pairs": 372,
"qa_ratio": 0.149038,
"problem_types": {
"A": {
"name": "Recall",
"count": 125,
"ratio_in_domain": 0.336022,
"ratio_overall": 0.050080
},
"B": {
"name": "Causal Inference",
"count": 93,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.037260
},
"C": {
"name": "State Updating",
"count": 93,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.037260
},
"D": {
"name": "State Abstraction",
"count": 61,
"ratio_in_domain": 0.163978,
"ratio_overall": 0.024439
}
}
},
"GAME": {
"total_episodes": 30,
"episode_ratio": 0.144231,
"total_qa_pairs": 360,
"qa_ratio": 0.144231,
"problem_types": {
"A": {
"name": "Recall",
"count": 120,
"ratio_in_domain": 0.333333,
"ratio_overall": 0.048077
},
"B": {
"name": "Causal Inference",
"count": 90,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.036058
},
"C": {
"name": "State Updating",
"count": 90,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.036058
},
"D": {
"name": "State Abstraction",
"count": 60,
"ratio_in_domain": 0.166667,
"ratio_overall": 0.024038
}
}
},
"EMBODIED_AI": {
"total_episodes": 30,
"episode_ratio": 0.144231,
"total_qa_pairs": 360,
"qa_ratio": 0.144231,
"problem_types": {
"A": {
"name": "Recall",
"count": 61,
"ratio_in_domain": 0.169444,
"ratio_overall": 0.024439
},
"B": {
"name": "Causal Inference",
"count": 90,
"ratio_in_domain": 0.250000,
"ratio_overall": 0.036058
},
"C": {
"name": "State Updating",
"count": 150,
"ratio_in_domain": 0.416667,
"ratio_overall": 0.060096
},
"D": {
"name": "State Abstraction",
"count": 59,
"ratio_in_domain": 0.163889,
"ratio_overall": 0.023638
}
}
},
"OPENWORLD_QA": {
"total_episodes": 30,
"episode_ratio": 0.144231,
"total_qa_pairs": 360,
"qa_ratio": 0.144231,
"problem_types": {
"A": {
"name": "Recall",
"count": 98,
"ratio_in_domain": 0.272222,
"ratio_overall": 0.039263
},
"B": {
"name": "Causal Inference",
"count": 95,
"ratio_in_domain": 0.263889,
"ratio_overall": 0.038062
},
"C": {
"name": "State Updating",
"count": 107,
"ratio_in_domain": 0.297222,
"ratio_overall": 0.042868
},
"D": {
"name": "State Abstraction",
"count": 60,
"ratio_in_domain": 0.166667,
"ratio_overall": 0.024038
}
}
}
}
}