maris-ai-master / core-python /evals /coder_execution_benchmark.json
MarisUK's picture
Maris AI model sync
f440f03 verified
{
"cases": [
{
"name": "coder_python_execution_normalize_email",
"message": "Uzraksti Python funkciju `normalize_email(email: str) -> str`, kas noņem atstarpes, normalizē lower-case un met ValueError tukšai ievadei.",
"profile": "coder",
"expected_terms": ["normalize_email", "ValueError"],
"tags": ["coding", "python", "execution"],
"branches": ["coder"],
"level": "ci",
"difficulty": "standard",
"category": "coding",
"expects_code": true,
"execution_language": "python",
"execution_test_code": "assert normalize_email(' A@Example.COM ') == 'a@example.com'\ntry:\n normalize_email(' ')\nexcept ValueError:\n pass\nelse:\n raise AssertionError('expected ValueError')"
},
{
"name": "coder_typescript_execution_next_delay",
"message": "Uzraksti TypeScript funkciju `nextDelay(attempt: number, baseMs = 250): number`, kas atbalsta exponential backoff (`baseMs * 2 ** (attempt - 1)`) un `attempt <= 0` gadījumā atgriež 0.",
"profile": "coder",
"expected_terms": ["nextDelay", "attempt"],
"tags": ["coding", "typescript", "execution"],
"branches": ["coder"],
"level": "ci",
"difficulty": "standard",
"category": "coding",
"expects_code": true,
"execution_language": "typescript",
"execution_test_code": "function assert(condition: boolean, message: string): void { if (!condition) throw new Error(message); }\nassert(nextDelay(0) === 0, 'attempt 0');\nassert(nextDelay(1) === 250, 'attempt 1');\nassert(nextDelay(3, 100) === 400, 'attempt 3')"
},
{
"name": "coder_rust_execution_load_port",
"message": "Uzraksti Rust funkciju `load_port(raw: &str) -> Result<u16, String>`, kas atgriež kļūdu tukšai vai nederīgai porta vērtībai (arī `0`) un nepieļauj panic.",
"profile": "coder",
"expected_terms": ["Result", "u16"],
"tags": ["coding", "rust", "execution"],
"branches": ["coder"],
"level": "ci",
"difficulty": "hard",
"category": "coding",
"expects_code": true,
"execution_language": "rust",
"execution_test_code": "fn main() {\n assert_eq!(load_port(\"8080\").unwrap(), 8080);\n assert!(load_port(\"\").is_err());\n assert!(load_port(\"0\").is_err());\n assert!(load_port(\"abc\").is_err());\n}"
},
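{
"name": "coder_sql_execution_pass_rate_regression",
"message": "Uzraksti vienu SQL SELECT vaicājumu, kas apkopo execution pass rate pa branch un language no tabulām `benchmark_results(id, branch)` un `execution_results(benchmark_run_id, language, passed)` (sasaiste: `execution_results.benchmark_run_id = benchmark_results.id`), atgriež kolonnas `branch`, `language`, `execution_pass_rate` un iezīmē branchus zem 0.8 sliekšņa ar `is_regression` kolonnu.",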
"profile": "coder",
"expected_terms": ["execution_pass_rate", "is_regression"],
"tags": ["coding", "sql", "execution"],
"branches": ["coder"],
"level": "ci",
"difficulty": "hard",
"category": "coding",
"expects_code": true,
"execution_language": "sql",
"execution_test_code": "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\nCREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\nINSERT INTO benchmark_results (id, branch) VALUES (1, 'coder');\nINSERT INTO execution_results (benchmark_run_id, language, passed) VALUES (1, 'typescript', 1), (1, 'typescript', 0), (1, 'rust', 1);\nCREATE TEMP TABLE actual AS {{CODE}};\nSELECT branch, language, execution_pass_rate, is_regression FROM actual;"
}
]
}