Kimi-K2.6-NVFP4 / every_eval_ever /swebench_lite_dev.json
nm-research's picture
Add BFCL v4 and SWE-bench Lite EveryEvalEver results
ce29fe8 verified
Raw
History Blame Contribute Delete
5.64 kB
{
"schema_version": "0.2.2",
"evaluation_id": "swebench_lite_dev/RedHatAI/Kimi-K2.6-NVFP4/1782913700",
"retrieved_timestamp": "1782913700",
"source_metadata": {
"source_name": "mini-swe-agent",
"source_type": "evaluation_run",
"source_organization_name": "RedHatAI",
"evaluator_relationship": "third_party",
"additional_details": {
"note": "submission-rate aggregation across reruns"
}
},
"eval_library": {
"name": "mini-swe-agent",
"version": "2.4.3"
},
"model_info": {
"name": "RedHatAI/Kimi-K2.6-NVFP4",
"id": "RedHatAI/Kimi-K2.6-NVFP4",
"developer": "RedHatAI",
"inference_engine": {
"name": "vllm",
"version": "0.22.1"
},
"additional_details": {
"profile": "kimi_nvfp4",
"subset": "lite",
"split": "dev"
}
},
"evaluation_results": [
{
"evaluation_name": "swebench_lite_dev/submission_rate",
"source_data": {
"dataset_name": "SWE-bench Lite dev",
"source_type": "hf_dataset",
"hf_repo": "princeton-nlp/SWE-Bench_Lite",
"hf_split": "dev",
"samples_number": 23,
"sample_ids": [
"marshmallow-code__marshmallow-1343",
"marshmallow-code__marshmallow-1359",
"pvlib__pvlib-python-1072",
"pvlib__pvlib-python-1154",
"pvlib__pvlib-python-1606",
"pvlib__pvlib-python-1707",
"pvlib__pvlib-python-1854",
"pydicom__pydicom-1139",
"pydicom__pydicom-1256",
"pydicom__pydicom-1413",
"pydicom__pydicom-1694",
"pydicom__pydicom-901",
"pylint-dev__astroid-1196",
"pylint-dev__astroid-1268",
"pylint-dev__astroid-1333",
"pylint-dev__astroid-1866",
"pylint-dev__astroid-1978",
"pyvista__pyvista-4315",
"sqlfluff__sqlfluff-1517",
"sqlfluff__sqlfluff-1625",
"sqlfluff__sqlfluff-1733",
"sqlfluff__sqlfluff-1763",
"sqlfluff__sqlfluff-2419"
]
},
"metric_config": {
"evaluation_description": "Submission rate (Submitted / total instances).",
"lower_is_better": false,
"score_type": "continuous",
"min_score": 0.0,
"max_score": 1.0
},
"score_details": {
"score": 0.9130434782608695,
"details": {
"total_instances": "23",
"submitted_instances": "21",
"unsubmitted_instances": "2",
"status_counts": "{\"RepeatedFormatError\": 2, \"Submitted\": 21}",
"unsubmitted_ids": "[\"sqlfluff__sqlfluff-1625\", \"sqlfluff__sqlfluff-1733\"]",
"source_exit_status_files": "[\"/home/shubhra/kimik2.6_evals/runs/swebench/20260629T171225Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782760165.6381476.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T135435Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782827902.5985954.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T141107Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782830121.1086848.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T151900Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782833877.9161725.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T190648Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782846629.8118455.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T192452Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782848479.344266.yaml\", \"/home/shubhra/kimik2.6_evals/runs/swebench/20260630T195651Z_kimi_nvfp4_lite_dev/output/exit_statuses_1782850223.0775344.yaml\"]"
},
"completed_ids": [
"marshmallow-code__marshmallow-1343",
"marshmallow-code__marshmallow-1359",
"pvlib__pvlib-python-1072",
"pvlib__pvlib-python-1154",
"pvlib__pvlib-python-1606",
"pvlib__pvlib-python-1707",
"pvlib__pvlib-python-1854",
"pydicom__pydicom-1139",
"pydicom__pydicom-1256",
"pydicom__pydicom-1413",
"pydicom__pydicom-1694",
"pydicom__pydicom-901",
"pylint-dev__astroid-1196",
"pylint-dev__astroid-1268",
"pylint-dev__astroid-1333",
"pylint-dev__astroid-1866",
"pylint-dev__astroid-1978",
"pyvista__pyvista-4315",
"sqlfluff__sqlfluff-1517",
"sqlfluff__sqlfluff-1625",
"sqlfluff__sqlfluff-1733",
"sqlfluff__sqlfluff-1763",
"sqlfluff__sqlfluff-2419"
],
"submitted_ids": [
"marshmallow-code__marshmallow-1343",
"marshmallow-code__marshmallow-1359",
"pvlib__pvlib-python-1072",
"pvlib__pvlib-python-1154",
"pvlib__pvlib-python-1606",
"pvlib__pvlib-python-1707",
"pvlib__pvlib-python-1854",
"pydicom__pydicom-1139",
"pydicom__pydicom-1256",
"pydicom__pydicom-1413",
"pydicom__pydicom-1694",
"pydicom__pydicom-901",
"pylint-dev__astroid-1196",
"pylint-dev__astroid-1268",
"pylint-dev__astroid-1333",
"pylint-dev__astroid-1866",
"pylint-dev__astroid-1978",
"pyvista__pyvista-4315",
"sqlfluff__sqlfluff-1517",
"sqlfluff__sqlfluff-1763",
"sqlfluff__sqlfluff-2419"
]
},
"generation_config": {
"generation_args": {
"agentic_eval_config": {
"available_tools": [
{
"name": "bash"
}
]
},
"max_attempts": 1
}
}
}
]
}