GameTheory-Formulator-Model / eval_results.json
Alogotron's picture
Phase 3 Formulator: QLoRA adapter (r=32, alpha=64) trained on 1,215 formulation problems
54929ea verified
{
"summary": {
"total": 20,
"valid_formulations": 20,
"valid_pct": 100.0,
"all_sections_pct": 100.0,
"all_elements_pct": 100.0,
"avg_response_length": 1821,
"avg_gen_time": 41.0
},
"results": [
{
"id": "FORM-BU-H-0131",
"domain": "business",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1873,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 40.8
},
{
"id": "FORM-SO-E-0105",
"domain": "social",
"difficulty": "easy",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1877,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 38.0
},
{
"id": "FORM-BU-M-0084",
"domain": "business",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1740,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 51.5
},
{
"id": "FORM-BU-M-0174",
"domain": "business",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1863,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 49.8
},
{
"id": "FORM-PO-E-0138",
"domain": "politics",
"difficulty": "easy",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1652,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 37.0
},
{
"id": "FORM-SE-M-0019",
"domain": "security",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1966,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 41.4
},
{
"id": "FORM-AU-H-0184",
"domain": "auctions",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 2218,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 41.9
},
{
"id": "FORM-SE-H-0066",
"domain": "security",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1763,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 36.4
},
{
"id": "FORM-AU-E-0041",
"domain": "auctions",
"difficulty": "easy",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 2142,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 46.8
},
{
"id": "FORM-PO-M-0006",
"domain": "politics",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1611,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 30.1
},
{
"id": "FORM-BU-E-0021",
"domain": "business",
"difficulty": "easy",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1712,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 36.3
},
{
"id": "FORM-BU-H-0181",
"domain": "business",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1973,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 49.7
},
{
"id": "FORM-SE-M-0016",
"domain": "security",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1971,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 39.1
},
{
"id": "FORM-BU-M-0078",
"domain": "business",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1450,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 44.1
},
{
"id": "FORM-SE-M-0063",
"domain": "security",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1736,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 37.8
},
{
"id": "FORM-BU-M-0204",
"domain": "business",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1885,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 37.9
},
{
"id": "FORM-TE-M-0135",
"domain": "technology",
"difficulty": "medium",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1485,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 32.9
},
{
"id": "FORM-SE-H-0108",
"domain": "security",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1669,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 44.1
},
{
"id": "FORM-TE-H-0184",
"domain": "technology",
"difficulty": "hard",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 2043,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 45.2
},
{
"id": "FORM-BU-E-0101",
"domain": "business",
"difficulty": "easy",
"sections_found": "4/4",
"has_all_sections": true,
"elements_found": "4/4",
"has_all_elements": true,
"response_length": 1799,
"is_substantial": true,
"valid_formulation": true,
"gen_time": 38.9
}
]
}