| { |
| "model": "OpenTrouter/Troviku-1.1", |
| "version": "1.1.0", |
| "evaluation_date": "2025-01-15", |
| "evaluator": "OpenTrouter Research Team", |
| |
| "benchmarks": { |
| "humaneval": { |
| "name": "HumanEval", |
| "description": "Hand-written programming problems for evaluating functional correctness", |
| "total_problems": 164, |
| "metrics": { |
| "pass_at_1": 0.72, |
| "pass_at_10": 0.89, |
| "pass_at_100": 0.94 |
| }, |
| "language": "python", |
| "temperature": 0.8, |
| "evaluation_notes": "Evaluated by executing generated solutions against each problem's unit tests" |
| }, |
| |
| "mbpp": { |
| "name": "MBPP", |
| "description": "Mostly Basic Python Problems - crowd-sourced programming problems", |
| "total_problems": 500, |
| "metrics": { |
| "pass_at_1": 0.68, |
| "pass_at_10": 0.84, |
| "pass_at_100": 0.91 |
| }, |
| "language": "python", |
| "temperature": 0.8, |
| "evaluation_notes": "Sanitized version used for evaluation" |
| }, |
| |
| "code_contests": { |
| "name": "CodeContests", |
| "description": "Competitive programming problems from various platforms", |
| "total_problems": 165, |
| "metrics": { |
| "pass_at_1": 0.45, |
| "pass_at_10": 0.68, |
| "pass_at_100": 0.79 |
| }, |
| "difficulty_breakdown": { |
| "introductory": 0.82, |
| "interview": 0.54, |
| "competition": 0.28 |
| }, |
| "language": "python", |
| "temperature": 0.8, |
| "evaluation_notes": "Competitive programming requiring advanced algorithms" |
| }, |
| |
| "multipl_e": { |
| "name": "MultiPL-E", |
| "description": "Multi-language evaluation benchmark", |
| "total_problems": 164, |
| "languages_evaluated": [ |
| "python", |
| "java", |
| "javascript", |
| "cpp", |
| "rust", |
| "go", |
| "typescript", |
| "php", |
| "ruby", |
| "swift" |
| ], |
| "metrics_by_language": { |
| "python": { |
| "pass_at_1": 0.72, |
| "pass_at_10": 0.88 |
| }, |
| "java": { |
| "pass_at_1": 0.65, |
| "pass_at_10": 0.82 |
| }, |
| "javascript": { |
| "pass_at_1": 0.68, |
| "pass_at_10": 0.85 |
| }, |
| "cpp": { |
| "pass_at_1": 0.61, |
| "pass_at_10": 0.78 |
| }, |
| "rust": { |
| "pass_at_1": 0.58, |
| "pass_at_10": 0.75 |
| }, |
| "go": { |
| "pass_at_1": 0.64, |
| "pass_at_10": 0.80 |
| }, |
| "typescript": { |
| "pass_at_1": 0.67, |
| "pass_at_10": 0.84 |
| }, |
| "php": { |
| "pass_at_1": 0.60, |
| "pass_at_10": 0.76 |
| }, |
| "ruby": { |
| "pass_at_1": 0.59, |
| "pass_at_10": 0.74 |
| }, |
| "swift": { |
| "pass_at_1": 0.56, |
| "pass_at_10": 0.72 |
| } |
| }, |
| "temperature": 0.8, |
| "evaluation_notes": "Cross-language code generation capability" |
| }, |
| |
| "ds_1000": { |
| "name": "DS-1000", |
| "description": "Data science code generation with 1000 problems", |
| "total_problems": 1000, |
| "metrics": { |
| "pass_at_1": 0.58, |
| "pass_at_10": 0.76 |
| }, |
| "libraries_covered": [ |
| "numpy", |
| "pandas", |
| "scikit_learn", |
| "matplotlib", |
| "scipy", |
| "pytorch", |
| "tensorflow" |
| ], |
| "metrics_by_library": { |
| "numpy": 0.64, |
| "pandas": 0.61, |
| "scikit_learn": 0.55, |
| "matplotlib": 0.58, |
| "scipy": 0.52, |
| "pytorch": 0.54, |
| "tensorflow": 0.51 |
| }, |
| "language": "python", |
| "temperature": 0.2, |
| "evaluation_notes": "Focused on data science and ML libraries" |
| } |
| }, |
| |
| "custom_evaluations": { |
| "code_translation": { |
| "description": "Translation accuracy between programming languages", |
| "language_pairs_tested": 45, |
| "metrics": { |
| "semantic_preservation": 0.84, |
| "syntax_correctness": 0.91, |
| "functional_equivalence": 0.78 |
| }, |
| "top_performing_pairs": [ |
| { |
| "source": "python", |
| "target": "javascript", |
| "score": 0.89 |
| }, |
| { |
| "source": "javascript", |
| "target": "typescript", |
| "score": 0.93 |
| }, |
| { |
| "source": "java", |
| "target": "kotlin", |
| "score": 0.87 |
| } |
| ] |
| }, |
| |
| "code_explanation": { |
| "description": "Quality of code explanations and documentation", |
| "total_samples": 200, |
| "metrics": { |
| "accuracy": 0.88, |
| "completeness": 0.82, |
| "clarity": 0.86, |
| "relevance": 0.90 |
| }, |
| "complexity_levels": { |
| "beginner": 0.93, |
| "intermediate": 0.86, |
| "advanced": 0.79 |
| } |
| }, |
| |
| "bug_detection": { |
| "description": "Ability to identify and fix bugs in code", |
| "total_samples": 250, |
| "bug_types": { |
| "syntax_errors": 0.95, |
| "logic_errors": 0.73, |
| "type_errors": 0.88, |
| "runtime_errors": 0.81, |
| "edge_cases": 0.65 |
| }, |
| "fix_success_rate": 0.76 |
| }, |
| |
| "code_review": { |
| "description": "Code review and improvement suggestions", |
| "total_samples": 150, |
| "categories": { |
| "code_quality": 0.84, |
| "security_issues": 0.78, |
| "performance_optimization": 0.72, |
| "best_practices": 0.86, |
| "readability": 0.89 |
| } |
| }, |
| |
| "test_generation": { |
| "description": "Quality of generated unit tests", |
| "total_samples": 180, |
| "metrics": { |
| "test_coverage": 0.81, |
| "edge_case_coverage": 0.68, |
| "test_correctness": 0.87, |
| "assertion_quality": 0.83 |
| } |
| } |
| }, |
| |
| "performance_metrics": { |
| "latency": { |
| "p50_ms": 1250, |
| "p95_ms": 3100, |
| "p99_ms": 5200 |
| }, |
| "throughput": { |
| "tokens_per_second": 45, |
| "requests_per_minute": 60 |
| }, |
| "resource_utilization": { |
| "average_gpu_memory_gb": 18, |
| "peak_gpu_memory_gb": 22, |
| "average_cpu_percent": 35 |
| } |
| }, |
| |
| "comparison_to_baselines": { |
| "models": [ |
| { |
| "name": "GPT-4-turbo", |
| "humaneval_pass_at_1": 0.84, |
| "mbpp_pass_at_1": 0.80 |
| }, |
| { |
| "name": "Claude-3.5-Sonnet", |
| "humaneval_pass_at_1": 0.82, |
| "mbpp_pass_at_1": 0.78 |
| }, |
| { |
| "name": "CodeLlama-34B", |
| "humaneval_pass_at_1": 0.68, |
| "mbpp_pass_at_1": 0.62 |
| }, |
| { |
| "name": "Troviku-1.1", |
| "humaneval_pass_at_1": 0.72, |
| "mbpp_pass_at_1": 0.68 |
| } |
| ], |
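| "notes": "Troviku-1.1, with specialized optimization for coding tasks, shows competitive performance: it outperforms CodeLlama-34B on HumanEval and MBPP pass@1 and trails GPT-4-turbo and Claude-3.5-Sonnet by 0.10-0.12" |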
| }, |
| |
| "limitations": { |
| "known_weaknesses": [ |
| "Complex algorithmic problems requiring advanced mathematics", |
| "Very large codebases with deep context dependencies", |
| "Highly specialized domain-specific languages", |
| "Real-time system programming with strict performance constraints", |
| "Legacy code with unusual or deprecated patterns" |
| ], |
| "improvement_areas": [ |
| "Enhanced multi-file context understanding", |
| "Better handling of rare programming languages", |
| "Improved optimization for competitive programming", |
| "Stronger performance on security-critical code" |
| ] |
| }, |
| |
| "methodology": { |
| "evaluation_framework": "Standard academic benchmarks with custom extensions", |
| "temperature_settings": "Varied by task type (0.2-0.8): 0.2 for DS-1000; 0.8 for HumanEval, MBPP, CodeContests, and MultiPL-E", |
| "sampling_strategy": "Pass@k evaluated with k=[1, 10, 100] for HumanEval, MBPP, and CodeContests; k=[1, 10] for MultiPL-E and DS-1000", |
| "test_execution": "Isolated sandbox environments with timeout limits", |
| "human_evaluation": "20% of outputs manually reviewed by senior engineers" |
| }, |
| |
| "reproducibility": { |
| "random_seed": 42, |
| "evaluation_code_repository": "https://github.com/OpenTrouter/troviku-eval", |
| "benchmark_versions": { |
| "humaneval": "v1.0", |
| "mbpp": "v1.0-sanitized", |
| "code_contests": "v1.0", |
| "multipl_e": "v0.2" |
| } |
| } |
| } |