maris-ai-master / core-python /evals /coder_release_benchmark.json

Maris AI model sync

f440f03 verified about 1 month ago

15.2 kB

	{
	"cases": [
	{
	"name": "coder_python_execution_normalize_email",
	"message": "Uzraksti Python funkciju `normalize_email(email: str) -> str`, kas noņem atstarpes, normalizē lower-case un met ValueError tukšai ievadei.",
	"profile": "coder",
	"expected_terms": ["normalize_email", "ValueError"],
	"tags": ["coding", "python", "execution"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "standard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "python",
	"execution_test_code": "assert normalize_email(' A@Example.COM ') == 'a@example.com'\ntry:\n normalize_email(' ')\nexcept ValueError:\n pass\nelse:\n raise AssertionError('expected ValueError')"
	},
	{
	"name": "coder_python_execution_parse_port",
	"message": "Uzraksti Python funkciju `parse_port(raw: str) -> int`, kas atgriež porta numuru, bet met ValueError tukšai, ne-skaitliskai vai ārpus 1..65535 ievadei.",
	"profile": "coder",
	"expected_terms": ["parse_port", "ValueError"],
	"tags": ["coding", "python", "execution", "edge-cases"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "python",
	"execution_test_code": "assert parse_port('8080') == 8080\nfor invalid in ('', 'abc', '0', '70000'):\n    try:\n        parse_port(invalid)\n    except ValueError:\n        pass\n    else:\n        raise AssertionError(f'expected ValueError for {invalid!r}')"
	},
	{
	"name": "coder_typescript_execution_next_delay",
	"message": "Uzraksti TypeScript funkciju `nextDelay(attempt: number, baseMs = 250): number`, kas atbalsta exponential backoff (`baseMs * 2 ** (attempt - 1)`) un `attempt <= 0` gadījumā atgriež 0.",
	"profile": "coder",
	"expected_terms": ["nextDelay", "attempt"],
	"tags": ["coding", "typescript", "execution"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "standard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "typescript",
	"execution_test_code": "function assert(condition: boolean, message: string): void { if (!condition) throw new Error(message); }\nassert(nextDelay(0) === 0, 'attempt 0');\nassert(nextDelay(1) === 250, 'attempt 1');\nassert(nextDelay(3, 100) === 400, 'attempt 3')"
	},
	{
	"name": "coder_rust_execution_load_port",
	"message": "Uzraksti Rust funkciju `load_port(raw: &str) -> Result<u16, String>`, kas atgriež kļūdu tukšai vai nederīgai (ne-skaitliskai vai ārpus 1..65535) porta vērtībai un nepieļauj panic.",
	"profile": "coder",
	"expected_terms": ["Result", "u16"],
	"tags": ["coding", "rust", "execution"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "rust",
	"execution_test_code": "fn main() {\n assert_eq!(load_port(\"8080\").unwrap(), 8080);\n assert!(load_port(\"\").is_err());\n assert!(load_port(\"0\").is_err());\n assert!(load_port(\"abc\").is_err());\n assert!(load_port(\"70000\").is_err());\n}"
	},
	{
	"name": "coder_sql_execution_pass_rate_regression",
	"message": "Uzraksti SQL vaicājumu, kas apkopo execution pass rate (kolonna `execution_pass_rate`) pa branch un language no benchmark_results (id, branch) un execution_results (benchmark_run_id, language, passed) tabulām, un iezīmē branchus zem 0.8 sliekšņa ar `is_regression` kolonnu.",
	"profile": "coder",
	"expected_terms": ["execution_pass_rate", "is_regression"],
	"tags": ["coding", "sql", "execution"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "sql",
	"execution_test_code": "CREATE TABLE benchmark_results (id INTEGER PRIMARY KEY, branch TEXT);\nCREATE TABLE execution_results (benchmark_run_id INTEGER, language TEXT, passed INTEGER);\nINSERT INTO benchmark_results (id, branch) VALUES (1, 'coder'), (2, 'planner');\nINSERT INTO execution_results (benchmark_run_id, language, passed) VALUES\n (1, 'typescript', 1),\n (1, 'typescript', 0),\n (1, 'rust', 1),\n (2, 'python', 1);\nCREATE TEMP TABLE actual AS {{CODE}};\nSELECT branch, language, execution_pass_rate, is_regression FROM actual;"
	},
	{
	"name": "coder_repo_patch_sse_contract",
	"message": "Balstoties uz backend-rust/src/api/chat.rs un frontend/app/chat/page.tsx, uzraksti repo-level patch plānu SSE delta/complete kontrakta salāgošanai ar drošu rollout secību.",
	"profile": "coder",
	"expected_terms": ["delta", "complete", "rollout"],
	"tags": ["coding", "repo-level", "grounding", "diff"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 2,
	"min_grounding_sources": 2,
	"expected_grounding_terms": ["backend-rust/src/api/chat.rs", "frontend/app/chat/page.tsx"]
	},
	{
	"name": "coder_repo_patch_python_bridge_failures",
	"message": "Balstoties uz backend-rust/src/inference/python_bridge.rs, piedāvā repo-level refactor patch plānu vienotam timeout/stderr/invalid JSON error modelim bez copy-paste mappinga.",
	"profile": "coder",
	"expected_terms": ["timeout", "invalid JSON", "error"],
	"tags": ["coding", "repo-level", "rust", "refactor"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 1,
	"min_grounding_sources": 1,
	"expected_grounding_terms": ["backend-rust/src/inference/python_bridge.rs"]
	},
	{
	"name": "coder_typescript_stream_event_union",
	"message": "Izveido TypeScript discriminated union helperi (tipu `ChatStreamEvent` ar diskriminatoru `type`) chat stream event payloadiem, lai UI kods compile-time līmenī atšķir `delta`, `complete` un `route` eventus; `route` eventam jāsatur lauks `route: string`.",
	"profile": "coder",
	"expected_terms": ["type", "delta", "complete"],
	"tags": ["coding", "typescript", "quality"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "standard",
	"category": "coding",
	"expects_code": true,
	"execution_language": "typescript",
	"execution_test_code": "function assert(condition: boolean, message: string): void { if (!condition) throw new Error(message); }\nconst routeEvent: ChatStreamEvent = { type: 'route', route: 'coder' };\nassert(routeEvent.type === 'route', 'route event');"
	},
	{
	"name": "coder_unsafe_pattern_repo_fix",
	"message": "Atrodi nedrošo pattern backend-rust konfigurācijas ielādē un piedāvā drošāku refactor, balstoties uz backend-rust/src/config.rs saturu.",
	"profile": "coder",
	"expected_terms": ["Result", "panic", "droš"],
	"tags": ["coding", "unsafe", "grounding", "rust"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "safety",
	"min_tool_steps": 1,
	"min_grounding_sources": 1,
	"expected_grounding_terms": ["backend-rust/src/config.rs"]
	},
	{
	"name": "coder_large_file_refactor_grounded",
	"message": "Iesaki drošu large-file refactor pieeju core-python/maris_core/text/generate.py un core-python/maris_core/text/tools.py, neizjaucot esošo grounding plūsmu.",
	"profile": "coder",
	"expected_terms": ["generate.py", "tools.py", "grounding"],
	"tags": ["coding", "large-file", "refactor", "grounding"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 2,
	"min_grounding_sources": 2,
	"expected_grounding_terms": ["core-python/maris_core/text/generate.py", "core-python/maris_core/text/tools.py"]
	},
	{
	"name": "coder_repo_sql_query_audit",
	"message": "Balstoties uz analytics/sql/query_audit.sql vai līdzīga SQL query slāņa patterniem, iesaki drošu refactor pieeju, kas aizvieto string concatenation ar parametrizētiem placeholderiem benchmark/event vaicājumiem.",
	"profile": "coder",
	"expected_terms": ["parameter", "query", "unsafe"],
	"tags": ["coding", "sql", "unsafe", "repo-level"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "safety"
	},
	{
	"name": "coder_partial_context_debugging",
	"message": "Mums tikai daļējs konteksts: tests flako ap chat stream complete event. Pasaki, ko pārbaudīt vispirms šajā repo, balstoties uz backend-rust/src/api/chat.rs un frontend/app/chat/page.tsx.",
	"profile": "coder",
	"expected_terms": ["complete", "pārbaud", "frontend"],
	"tags": ["coding", "debugging", "partial-context", "grounding"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 2,
	"min_grounding_sources": 2,
	"expected_grounding_terms": ["backend-rust/src/api/chat.rs", "frontend/app/chat/page.tsx"]
	},
	{
	"name": "coder_benchmark_history_regression_patch",
	"message": "Balstoties uz core-python/maris_core/text/benchmark.py, core-python/maris_core/training/train.py un core-python/scripts/eval_model.py, uzraksti repo-wide patch plānu benchmark history/regression tracking slānim ar artefaktiem, kas salīdzina current run pret baseline pa language un category.",
	"profile": "coder",
	"expected_terms": ["history", "regression", "category", "language"],
	"tags": ["coding", "repo-level", "diff", "grounding", "benchmark"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 3,
	"min_grounding_sources": 3,
	"expected_grounding_terms": [
	"core-python/maris_core/text/benchmark.py",
	"core-python/maris_core/training/train.py",
	"core-python/scripts/eval_model.py"
	]
	},
	{
	"name": "coder_multi_file_bugfix_complete_event_duplication",
	"message": "Balstoties uz frontend/app/chat/page.tsx un frontend/tests/chat.test.tsx, piedāvā multi-file bugfix patch, kas novērš dubultotu assistant final ziņu, kad complete event pienāk pēc pēdējā delta chunk.",
	"profile": "coder",
	"expected_terms": ["complete", "delta", "tests"],
	"tags": ["coding", "multi-file", "bugfix", "grounding", "regression-risk"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 2,
	"min_grounding_sources": 2,
	"expected_grounding_terms": ["frontend/app/chat/page.tsx", "frontend/tests/chat.test.tsx"]
	},
	{
	"name": "coder_risky_refactor_stream_contract",
	"message": "Balstoties uz backend-rust/src/api/chat.rs, frontend/app/chat/page.tsx un frontend/tests/chat.test.tsx, apraksti refactor ar regresiju riskiem stream event kontraktam, saglabājot backward-compatible rollout un delta/complete testus.",
	"profile": "coder",
	"expected_terms": ["backward-compatible", "delta", "complete", "tests"],
	"tags": ["coding", "repo-level", "refactor", "regression-risk", "grounding"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 3,
	"min_grounding_sources": 3,
	"expected_grounding_terms": [
	"backend-rust/src/api/chat.rs",
	"frontend/app/chat/page.tsx",
	"frontend/tests/chat.test.tsx"
	]
	},
	{
	"name": "coder_ci_debug_execution_benchmark_incident",
	"message": "Balstoties uz .github/workflows/core-train.yml, .github/workflows/lint-and-test.yml un core-python/scripts/eval_model.py, izveido incident-debugging patch plānu gadījumam, kad coder execution benchmarki vairs nepublicē history/regression artefaktus pēc workflow runa.",
	"profile": "coder",
	"expected_terms": ["workflow", "artifact", "history", "regression"],
	"tags": ["coding", "ci", "debugging", "incident-recovery", "grounding"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 3,
	"min_grounding_sources": 3,
	"expected_grounding_terms": [
	".github/workflows/core-train.yml",
	".github/workflows/lint-and-test.yml",
	"core-python/scripts/eval_model.py"
	]
	},
	{
	"name": "coder_python_bridge_incident_recovery",
	"message": "Balstoties uz backend-rust/src/inference/python_bridge.rs un backend-rust/src/api/chat.rs, uzraksti incident-recovery patch plānu timeout/stderr/invalid JSON degradācijas scenārijam, kur vajag ātru rollback, labāku diagnostiku un regresijas testus.",
	"profile": "coder",
	"expected_terms": ["timeout", "rollback", "diagnost", "tests"],
	"tags": ["coding", "incident-recovery", "debugging", "grounding", "repo-level"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 2,
	"min_grounding_sources": 2,
	"expected_grounding_terms": [
	"backend-rust/src/inference/python_bridge.rs",
	"backend-rust/src/api/chat.rs"
	],
	"production_like": true
	},
	{
	"name": "coder_flaky_ci_grounded_fix_plan",
	"message": "Balstoties uz .github/workflows/lint-and-test.yml, frontend/tests/chat.test.tsx un backend-rust/tests/api_tests.rs, uzraksti grounded fix plānu flaky CI scenārijam, kur chat stream complete tests izkrīt tikai GitHub Actions vidē.",
	"profile": "coder",
	"expected_terms": ["flaky", "GitHub Actions", "complete", "tests"],
	"tags": ["coding", "ci", "flaky", "grounding", "incident-recovery"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 3,
	"min_grounding_sources": 3,
	"expected_grounding_terms": [
	".github/workflows/lint-and-test.yml",
	"frontend/tests/chat.test.tsx",
	"backend-rust/tests/api_tests.rs"
	],
	"production_like": true
	},
	{
	"name": "coder_config_diff_rollback_regression_review",
	"message": "Balstoties uz huggingface/training-config.json, core-python/maris_core/training/config.py un .github/workflows/core-train.yml, piedāvā grounded patch plānu config diff + rollback scenārijam, kur benchmark gate pēc release workflow vairs neizmanto branch-specific suite.",
	"profile": "coder",
	"expected_terms": ["config", "rollback", "branch-specific", "benchmark"],
	"tags": ["coding", "config-diff", "rollback", "grounding", "benchmark"],
	"branches": ["coder"],
	"level": "release",
	"difficulty": "hard",
	"category": "grounding",
	"min_tool_steps": 3,
	"min_grounding_sources": 3,
	"expected_grounding_terms": [
	"huggingface/training-config.json",
	"core-python/maris_core/training/config.py",
	".github/workflows/core-train.yml"
	],
	"production_like": true
	}
	]
	}