benchmark_results.json · OpenTrouter/Troviku-1.1 at refs/pr/4

Troviku-1.1 / benchmark_results.json

Create benchmark_results.json

8697525 verified 6 months ago

7.95 kB

	{
	"model": "OpenTrouter/Troviku-1.1",
	"version": "1.1.0",
	"evaluation_date": "2025-01-15",
	"evaluator": "OpenTrouter Research Team",

	"benchmarks": {
	"humaneval": {
	"name": "HumanEval",
	"description": "Hand-written programming problems for evaluating functional correctness",
	"total_problems": 164,
	"metrics": {
	"pass_at_1": 0.72,
	"pass_at_10": 0.89,
	"pass_at_100": 0.94
	},
	"language": "python",
	"temperature": 0.8,
	"evaluation_notes": "Functional correctness evaluated by executing generated code against each problem's test cases"
	},

	"mbpp": {
	"name": "MBPP",
	"description": "Mostly Basic Python Problems - crowd-sourced programming problems",
	"total_problems": 500,
	"metrics": {
	"pass_at_1": 0.68,
	"pass_at_10": 0.84,
	"pass_at_100": 0.91
	},
	"language": "python",
	"temperature": 0.8,
	"evaluation_notes": "Sanitized version used for evaluation"
	},

	"code_contests": {
	"name": "CodeContests",
	"description": "Competitive programming problems from various platforms",
	"total_problems": 165,
	"metrics": {
	"pass_at_1": 0.45,
	"pass_at_10": 0.68,
	"pass_at_100": 0.79
	},
	"difficulty_breakdown": {
	"introductory": 0.82,
	"interview": 0.54,
	"competition": 0.28
	},
	"language": "python",
	"temperature": 0.8,
	"evaluation_notes": "Competitive programming requiring advanced algorithms"
	},

	"multipl_e": {
	"name": "MultiPL-E",
	"description": "Multi-language evaluation benchmark",
	"total_problems": 164,
	"languages_evaluated": [
	"python",
	"java",
	"javascript",
	"cpp",
	"rust",
	"go",
	"typescript",
	"php",
	"ruby",
	"swift"
	],
	"metrics_by_language": {
	"python": {
	"pass_at_1": 0.72,
	"pass_at_10": 0.88
	},
	"java": {
	"pass_at_1": 0.65,
	"pass_at_10": 0.82
	},
	"javascript": {
	"pass_at_1": 0.68,
	"pass_at_10": 0.85
	},
	"cpp": {
	"pass_at_1": 0.61,
	"pass_at_10": 0.78
	},
	"rust": {
	"pass_at_1": 0.58,
	"pass_at_10": 0.75
	},
	"go": {
	"pass_at_1": 0.64,
	"pass_at_10": 0.80
	},
	"typescript": {
	"pass_at_1": 0.67,
	"pass_at_10": 0.84
	},
	"php": {
	"pass_at_1": 0.60,
	"pass_at_10": 0.76
	},
	"ruby": {
	"pass_at_1": 0.59,
	"pass_at_10": 0.74
	},
	"swift": {
	"pass_at_1": 0.56,
	"pass_at_10": 0.72
	}
	},
	"temperature": 0.8,
	"evaluation_notes": "Cross-language code generation capability"
	},

	"ds_1000": {
	"name": "DS-1000",
	"description": "Data science code generation with 1000 problems",
	"total_problems": 1000,
	"metrics": {
	"pass_at_1": 0.58,
	"pass_at_10": 0.76
	},
	"libraries_covered": [
	"numpy",
	"pandas",
	"scikit-learn",
	"matplotlib",
	"scipy",
	"pytorch",
	"tensorflow"
	],
	"metrics_by_library": {
	"numpy": 0.64,
	"pandas": 0.61,
	"scikit-learn": 0.55,
	"matplotlib": 0.58,
	"scipy": 0.52,
	"pytorch": 0.54,
	"tensorflow": 0.51
	},
	"language": "python",
	"temperature": 0.2,
	"evaluation_notes": "Focused on data science and ML libraries"
	}
	},

	"custom_evaluations": {
	"code_translation": {
	"description": "Translation accuracy between programming languages",
	"language_pairs_tested": 45,
	"metrics": {
	"semantic_preservation": 0.84,
	"syntax_correctness": 0.91,
	"functional_equivalence": 0.78
	},
	"top_performing_pairs": [
	{
	"source": "python",
	"target": "javascript",
	"score": 0.89
	},
	{
	"source": "javascript",
	"target": "typescript",
	"score": 0.93
	},
	{
	"source": "java",
	"target": "kotlin",
	"score": 0.87
	}
	]
	},

	"code_explanation": {
	"description": "Quality of code explanations and documentation",
	"total_samples": 200,
	"metrics": {
	"accuracy": 0.88,
	"completeness": 0.82,
	"clarity": 0.86,
	"relevance": 0.90
	},
	"complexity_levels": {
	"beginner": 0.93,
	"intermediate": 0.86,
	"advanced": 0.79
	}
	},

	"bug_detection": {
	"description": "Ability to identify and fix bugs in code",
	"total_samples": 250,
	"bug_types": {
	"syntax_errors": 0.95,
	"logic_errors": 0.73,
	"type_errors": 0.88,
	"runtime_errors": 0.81,
	"edge_cases": 0.65
	},
	"fix_success_rate": 0.76
	},

	"code_review": {
	"description": "Code review and improvement suggestions",
	"total_samples": 150,
	"categories": {
	"code_quality": 0.84,
	"security_issues": 0.78,
	"performance_optimization": 0.72,
	"best_practices": 0.86,
	"readability": 0.89
	}
	},

	"test_generation": {
	"description": "Quality of generated unit tests",
	"total_samples": 180,
	"metrics": {
	"test_coverage": 0.81,
	"edge_case_coverage": 0.68,
	"test_correctness": 0.87,
	"assertion_quality": 0.83
	}
	}
	},

	"performance_metrics": {
	"latency": {
	"p50_ms": 1250,
	"p95_ms": 3100,
	"p99_ms": 5200
	},
	"throughput": {
	"tokens_per_second": 45,
	"requests_per_minute": 60
	},
	"resource_utilization": {
	"average_gpu_memory_gb": 18,
	"peak_gpu_memory_gb": 22,
	"average_cpu_percent": 35
	}
	},

	"comparison_to_baselines": {
	"models": [
	{
	"name": "GPT-4-turbo",
	"humaneval_pass_at_1": 0.84,
	"mbpp_pass_at_1": 0.80
	},
	{
	"name": "Claude-3.5-Sonnet",
	"humaneval_pass_at_1": 0.82,
	"mbpp_pass_at_1": 0.78
	},
	{
	"name": "CodeLlama-34B",
	"humaneval_pass_at_1": 0.68,
	"mbpp_pass_at_1": 0.62
	},
	{
	"name": "Troviku-1.1",
	"humaneval_pass_at_1": 0.72,
	"mbpp_pass_at_1": 0.68
	}
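	],
	"notes": "Troviku-1.1 is optimized for coding tasks: on pass@1 it outperforms CodeLlama-34B (HumanEval 0.72 vs 0.68, MBPP 0.68 vs 0.62) while trailing GPT-4-turbo and Claude-3.5-Sonnet on both benchmarks"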
	},

	"limitations": {
	"known_weaknesses": [
	"Complex algorithmic problems requiring advanced mathematics",
	"Very large codebases with deep context dependencies",
	"Highly specialized domain-specific languages",
	"Real-time system programming with strict performance constraints",
	"Legacy code with unusual or deprecated patterns"
	],
	"improvement_areas": [
	"Enhanced multi-file context understanding",
	"Better handling of rare programming languages",
	"Improved optimization for competitive programming",
	"Stronger performance on security-critical code"
	]
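	},

	"methodology": {
	"evaluation_framework": "Standard academic benchmarks with custom extensions",
	"temperature_settings": "Varied by task type (0.2 for DS-1000, 0.8 for the other standard benchmarks)",
	"sampling_strategy": "Pass@k evaluated with k=[1, 10, 100]; pass@100 is reported only for HumanEval, MBPP, and CodeContests",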
	"test_execution": "Isolated sandbox environments with timeout limits",
	"human_evaluation": "20% of outputs manually reviewed by senior engineers"
	},

	"reproducibility": {
	"random_seed": 42,
	"evaluation_code_repository": "https://github.com/OpenTrouter/troviku-eval",
	"benchmark_versions": {
	"humaneval": "v1.0",
	"mbpp": "v1.0-sanitized",
	"code_contests": "v1.0",
	"multipl_e": "v0.2"
	}
	}
	}